From 91e9fa4f8518f21c67917f66bc83fb14bc71c80f Mon Sep 17 00:00:00 2001 From: GNU Libc Maintainers Date: Sun, 28 Dec 2025 17:32:33 +0100 Subject: [PATCH] git-updates GIT update of https://sourceware.org/git/glibc.git/release/2.41/master from glibc-2.41 GIT update of https://sourceware.org/git/glibc.git/release/2.41/master from glibc-2.41 Gbp-Pq: Name git-updates.diff --- ADVISORIES | 2 + Makeconfig | 2 +- NEWS | 33 ++ advisories/GLIBC-SA-2023-0001 | 14 - advisories/GLIBC-SA-2023-0002 | 15 - advisories/GLIBC-SA-2023-0003 | 15 - advisories/GLIBC-SA-2023-0004 | 16 - advisories/GLIBC-SA-2023-0005 | 18 - advisories/GLIBC-SA-2024-0001 | 15 - advisories/GLIBC-SA-2024-0002 | 15 - advisories/GLIBC-SA-2024-0003 | 13 - advisories/GLIBC-SA-2024-0004 | 28 -- advisories/GLIBC-SA-2024-0005 | 22 - advisories/GLIBC-SA-2024-0006 | 32 -- advisories/GLIBC-SA-2024-0007 | 28 -- advisories/GLIBC-SA-2024-0008 | 26 -- advisories/GLIBC-SA-2025-0001 | 25 -- advisories/README | 77 ---- assert/Makefile | 1 + assert/tst-assert-sa-2025-0001.c | 92 +++++ benchtests/atanh-inputs | 1 + benchtests/sinh-inputs | 1 + config.make.in | 1 + configure | 162 ++++++++ configure.ac | 32 ++ elf/Makefile | 45 +- elf/dl-execstack-tunable.c | 39 ++ elf/dl-execstack.c | 2 +- elf/dl-find_object.c | 79 ++-- elf/dl-find_object.h | 4 +- elf/dl-load.c | 4 +- elf/dl-reloc-static-pie.c | 3 +- elf/dl-support.c | 4 +- elf/dl-tls.c | 7 + elf/dl-tunables.list | 2 +- elf/rtld.c | 88 ++-- elf/tst-audit-tlsdesc-dlopen2.c | 46 +++ elf/tst-auditmod-tlsdesc2.c | 59 +++ elf/tst-dlopen-sgid-mod.c | 1 + elf/tst-dlopen-sgid.c | 106 +++++ elf/tst-env-setuid-tunables.c | 18 +- elf/tst-env-setuid.c | 17 +- elf/tst-execstack-prog-static-tunable.c | 1 + elf/tst-execstack-tunable.c | 1 + elf/tst-link-map-contiguous-ldso.c | 98 +++++ elf/tst-link-map-contiguous-libc.c | 57 +++ elf/tst-link-map-contiguous-main.c | 45 ++ elf/tst-pie-bss-static.c | 19 + .../strcmp-power10.S => elf/tst-pie-bss.c | 20 +- elf/tst-rtld-list-tunables.exp | 2 +- 
iconv/iconv_prog.c | 4 +- iconv/tst-iconv_prog-buffer.sh | 4 + include/dlfcn.h | 3 +- math/auto-libm-test-in | 4 + math/auto-libm-test-out-log10p1 | 25 ++ math/auto-libm-test-out-sinh | 25 ++ math/auto-libm-test-out-tan | 25 ++ math/bits/mathcalls-macros.h | 2 +- nptl/Makefile | 3 + nptl/cancellation.c | 4 +- nptl/pthread_cancel.c | 14 +- nptl/pthread_getattr_np.c | 4 +- posix/Makefile | 1 + posix/environ.c | 4 + posix/regcomp.c | 4 +- posix/tst-regcomp-bracket-free.c | 176 ++++++++ stdlib/Makefile | 2 + stdlib/abort.c | 6 +- stdlib/getenv.c | 3 - stdlib/tst-getenv-static.c | 38 ++ stdlib/tst-secure-getenv.c | 9 +- support/capture_subprocess.h | 11 +- support/support_capture_subprocess.c | 166 ++++---- sysdeps/aarch64/fpu/acos_advsimd.c | 56 ++- sysdeps/aarch64/fpu/acos_sve.c | 75 ++-- sysdeps/aarch64/fpu/acosh_sve.c | 6 +- sysdeps/aarch64/fpu/asin_advsimd.c | 46 ++- sysdeps/aarch64/fpu/asin_sve.c | 74 ++-- sysdeps/aarch64/fpu/asinf_advsimd.c | 31 +- sysdeps/aarch64/fpu/asinh_sve.c | 111 +++-- sysdeps/aarch64/fpu/atan2_advsimd.c | 128 +++--- sysdeps/aarch64/fpu/atan2_sve.c | 107 +++-- sysdeps/aarch64/fpu/atan2f_advsimd.c | 64 +-- sysdeps/aarch64/fpu/atan2f_sve.c | 61 +-- sysdeps/aarch64/fpu/atan_advsimd.c | 83 ++-- sysdeps/aarch64/fpu/atan_sve.c | 104 +++-- sysdeps/aarch64/fpu/atanf_advsimd.c | 97 +++-- sysdeps/aarch64/fpu/atanf_sve.c | 84 ++-- sysdeps/aarch64/fpu/atanh_sve.c | 3 +- sysdeps/aarch64/fpu/cosh_sve.c | 135 +++--- sysdeps/aarch64/fpu/coshf_sve.c | 6 +- sysdeps/aarch64/fpu/erfcf_sve.c | 12 +- sysdeps/aarch64/fpu/exp10_sve.c | 25 +- sysdeps/aarch64/fpu/exp10f_sve.c | 53 +-- sysdeps/aarch64/fpu/exp2_sve.c | 76 ++-- sysdeps/aarch64/fpu/exp2f_sve.c | 35 +- sysdeps/aarch64/fpu/exp_sve.c | 36 +- sysdeps/aarch64/fpu/expf_sve.c | 6 +- sysdeps/aarch64/fpu/expm1_sve.c | 202 ++++++--- sysdeps/aarch64/fpu/log1p_sve.c | 84 +++- sysdeps/aarch64/fpu/pow_sve.c | 245 ++++++----- sysdeps/aarch64/fpu/powf_sve.c | 117 +++--- sysdeps/aarch64/fpu/sinh_sve.c | 165 +++++--- 
sysdeps/aarch64/fpu/sv_expf_inline.h | 31 +- sysdeps/aarch64/fpu/sv_log1p_inline.h | 86 +++- sysdeps/aarch64/fpu/tanh_sve.c | 154 ++++--- sysdeps/aarch64/multiarch/Makefile | 1 + sysdeps/aarch64/multiarch/ifunc-impl-list.c | 1 + sysdeps/aarch64/multiarch/memset.c | 4 + sysdeps/aarch64/multiarch/memset_sve_zva64.S | 123 ++++++ sysdeps/arm/find_exidx.c | 3 +- sysdeps/generic/ldsodefs.h | 15 +- sysdeps/ieee754/dbl-64/e_atanh.c | 8 + sysdeps/ieee754/dbl-64/e_sinh.c | 8 + sysdeps/ieee754/dbl-64/math_config.h | 6 +- sysdeps/ieee754/dbl-64/s_fma.c | 3 + sysdeps/ieee754/dbl-64/s_tanh.c | 5 + sysdeps/ieee754/flt-32/e_sinhf.c | 2 +- sysdeps/ieee754/flt-32/s_log10p1f.c | 2 +- sysdeps/ieee754/flt-32/s_tanf.c | 2 +- sysdeps/mach/hurd/dl-execstack.c | 5 +- sysdeps/nptl/bits/thread-shared-types.h | 2 + sysdeps/nptl/dl-tls_init_tp.c | 1 + sysdeps/nptl/pthread.h | 2 +- sysdeps/powerpc/powerpc64/le/power10/memchr.S | 315 -------------- sysdeps/powerpc/powerpc64/le/power10/strcmp.S | 233 ----------- .../powerpc/powerpc64/le/power10/strncmp.S | 271 ------------ sysdeps/powerpc/powerpc64/multiarch/Makefile | 11 +- .../powerpc64/multiarch/ifunc-impl-list.c | 13 - sysdeps/powerpc/powerpc64/multiarch/memchr.c | 20 +- sysdeps/powerpc/powerpc64/multiarch/strcmp.c | 4 - sysdeps/powerpc/powerpc64/multiarch/strncmp.c | 4 - sysdeps/pthread/Makefile | 11 + sysdeps/pthread/tst-cancel32.c | 73 ++++ sysdeps/pthread/tst-stack2-mod.c | 39 ++ sysdeps/pthread/tst-stack2.c | 47 +++ sysdeps/riscv/dl-machine.h | 17 +- sysdeps/sparc/sparc32/start.S | 11 +- sysdeps/sparc/sparc64/start.S | 4 + sysdeps/unix/sysv/linux/aarch64/Makefile | 129 +++++- .../unix/sysv/linux/aarch64/cpu-features.c | 1 + .../sysv/linux/aarch64/tst-aarch64-pkey.c | 4 + .../unix/sysv/linux/aarch64/tst-gcs-abort.sh | 39 ++ .../linux/aarch64/tst-gcs-disabled-static.c | 1 + .../sysv/linux/aarch64/tst-gcs-disabled.c | 2 + .../linux/aarch64/tst-gcs-dlopen-disabled.c | 3 + .../linux/aarch64/tst-gcs-dlopen-enforced.c | 3 + 
.../aarch64/tst-gcs-dlopen-optional-off.c | 3 + .../aarch64/tst-gcs-dlopen-optional-on.c | 3 + .../linux/aarch64/tst-gcs-dlopen-override.c | 3 + .../unix/sysv/linux/aarch64/tst-gcs-dlopen.c | 62 +++ .../linux/aarch64/tst-gcs-enforced-abort.c | 2 + .../aarch64/tst-gcs-enforced-static-abort.c | 1 + .../linux/aarch64/tst-gcs-enforced-static.c | 1 + .../sysv/linux/aarch64/tst-gcs-enforced.c | 2 + .../unix/sysv/linux/aarch64/tst-gcs-helper.h | 39 ++ .../sysv/linux/aarch64/tst-gcs-mod1.c} | 16 +- .../sysv/linux/aarch64/tst-gcs-mod2.c} | 19 +- .../unix/sysv/linux/aarch64/tst-gcs-mod3.c | 25 ++ .../sysv/linux/aarch64/tst-gcs-noreturn.c | 101 +++++ .../sysv/linux/aarch64/tst-gcs-optional-off.c | 2 + .../sysv/linux/aarch64/tst-gcs-optional-on.c | 2 + .../aarch64/tst-gcs-optional-static-off.c | 1 + .../aarch64/tst-gcs-optional-static-on.c | 1 + .../linux/aarch64/tst-gcs-override-static.c | 1 + .../sysv/linux/aarch64/tst-gcs-override.c | 2 + .../linux/aarch64/tst-gcs-shared-disabled.c | 2 + .../aarch64/tst-gcs-shared-enforced-abort.c | 2 + .../linux/aarch64/tst-gcs-shared-optional.c | 2 + .../linux/aarch64/tst-gcs-shared-override.c | 2 + .../unix/sysv/linux/aarch64/tst-gcs-shared.c | 41 ++ .../sysv/linux/aarch64/tst-gcs-skeleton.c | 43 ++ sysdeps/unix/sysv/linux/bits/sched.h | 2 +- sysdeps/unix/sysv/linux/dl-execstack.c | 7 +- sysdeps/unix/sysv/linux/rseq-internal.h | 11 +- sysdeps/x86/Makefile | 22 + sysdeps/x86/bits/floatn.h | 10 +- sysdeps/x86/cpu-features.c | 389 +++++++++--------- sysdeps/x86/cpu-tunables.c | 2 + sysdeps/x86/dl-diagnostics-cpu.c | 2 + sysdeps/x86/include/cpu-features.h | 9 +- sysdeps/x86/sysdep.h | 6 + sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c | 1 + sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c | 1 + sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c | 1 + sysdeps/x86_64/Makefile | 1 - sysdeps/x86_64/dl-tlsdesc-dynamic.h | 2 +- sysdeps/x86_64/fpu/multiarch/Makefile | 6 + sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c | 6 + sysdeps/x86_64/fpu/multiarch/e_atanh.c | 34 ++ 
sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c | 12 + sysdeps/x86_64/fpu/multiarch/e_sinh.c | 35 ++ sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c | 11 + sysdeps/x86_64/fpu/multiarch/s_tanh.c | 31 ++ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 +- sysdeps/x86_64/tst-auditmod10b.c | 109 ++--- 196 files changed, 4334 insertions(+), 2794 deletions(-) create mode 100644 ADVISORIES delete mode 100644 advisories/GLIBC-SA-2023-0001 delete mode 100644 advisories/GLIBC-SA-2023-0002 delete mode 100644 advisories/GLIBC-SA-2023-0003 delete mode 100644 advisories/GLIBC-SA-2023-0004 delete mode 100644 advisories/GLIBC-SA-2023-0005 delete mode 100644 advisories/GLIBC-SA-2024-0001 delete mode 100644 advisories/GLIBC-SA-2024-0002 delete mode 100644 advisories/GLIBC-SA-2024-0003 delete mode 100644 advisories/GLIBC-SA-2024-0004 delete mode 100644 advisories/GLIBC-SA-2024-0005 delete mode 100644 advisories/GLIBC-SA-2024-0006 delete mode 100644 advisories/GLIBC-SA-2024-0007 delete mode 100644 advisories/GLIBC-SA-2024-0008 delete mode 100644 advisories/GLIBC-SA-2025-0001 delete mode 100644 advisories/README create mode 100644 assert/tst-assert-sa-2025-0001.c create mode 100644 elf/dl-execstack-tunable.c create mode 100644 elf/tst-audit-tlsdesc-dlopen2.c create mode 100644 elf/tst-auditmod-tlsdesc2.c create mode 100644 elf/tst-dlopen-sgid-mod.c create mode 100644 elf/tst-dlopen-sgid.c create mode 100644 elf/tst-execstack-prog-static-tunable.c create mode 100644 elf/tst-execstack-tunable.c create mode 100644 elf/tst-link-map-contiguous-ldso.c create mode 100644 elf/tst-link-map-contiguous-libc.c create mode 100644 elf/tst-link-map-contiguous-main.c create mode 100644 elf/tst-pie-bss-static.c rename sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S => elf/tst-pie-bss.c (66%) create mode 100644 posix/tst-regcomp-bracket-free.c create mode 100644 stdlib/tst-getenv-static.c create mode 100644 sysdeps/aarch64/multiarch/memset_sve_zva64.S delete mode 100644 
sysdeps/powerpc/powerpc64/le/power10/memchr.S delete mode 100644 sysdeps/powerpc/powerpc64/le/power10/strcmp.S delete mode 100644 sysdeps/powerpc/powerpc64/le/power10/strncmp.S create mode 100644 sysdeps/pthread/tst-cancel32.c create mode 100644 sysdeps/pthread/tst-stack2-mod.c create mode 100644 sysdeps/pthread/tst-stack2.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled-static.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-disabled.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-enforced.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-off.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-on.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-override.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-abort.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static-abort.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-helper.h rename sysdeps/{powerpc/powerpc64/multiarch/strncmp-power10.S => unix/sysv/linux/aarch64/tst-gcs-mod1.c} (72%) rename sysdeps/{powerpc/powerpc64/multiarch/memchr-power10.S => unix/sysv/linux/aarch64/tst-gcs-mod2.c} (66%) create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod3.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-noreturn.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-off.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-on.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-off.c create mode 100644 
sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-on.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-override-static.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-override.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-disabled.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-enforced-abort.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-optional.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-override.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-skeleton.c create mode 100644 sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c create mode 100644 sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c create mode 100644 sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c create mode 100644 sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c create mode 100644 sysdeps/x86_64/fpu/multiarch/e_atanh.c create mode 100644 sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c create mode 100644 sysdeps/x86_64/fpu/multiarch/e_sinh.c create mode 100644 sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c create mode 100644 sysdeps/x86_64/fpu/multiarch/s_tanh.c diff --git a/ADVISORIES b/ADVISORIES new file mode 100644 index 000000000..d4e33f2df --- /dev/null +++ b/ADVISORIES @@ -0,0 +1,2 @@ +For the GNU C Library Security Advisories, see the git master branch: +https://sourceware.org/git/?p=glibc.git;a=tree;f=advisories;hb=HEAD diff --git a/Makeconfig b/Makeconfig index d0108d2ca..aa547a443 100644 --- a/Makeconfig +++ b/Makeconfig @@ -633,7 +633,7 @@ link-libc-printers-tests = $(link-libc-rpath) \ $(link-libc-tests-after-rpath-link) # This is how to find at build-time things that will be installed there. 
-rpath-dirs = math elf dlfcn nss nis rt resolv mathvec support +rpath-dirs = math elf dlfcn nss nis rt resolv mathvec support misc rpath-link = \ $(common-objdir):$(subst $(empty) ,:,$(patsubst ../$(subdir),.,$(rpath-dirs:%=$(common-objpfx)%))) else # build-static diff --git a/NEWS b/NEWS index b11422b06..f77d1471c 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,39 @@ See the end for copying conditions. Please send GNU C library bug reports via <https://sourceware.org/bugzilla/> using `glibc' in the "product" field. +Version 2.41.1 + +Deprecated and removed features, and other changes affecting compatibility: + +* The glibc.rtld.execstack tunable now supports a compatibility mode to + allow programs that require an executable stack through dynamically + loaded shared libraries. + +The following bugs were resolved with this release: + + [31943] _dl_find_object can fail if ld.so contains gaps between load segments + [32269] RISC-V IFUNC resolver cannot access gp pointer + [32626] math: log10p1f is not correctly rounded + [32627] math: sinhf is not correctly rounded + [32630] math: tanf is not correctly rounded for all rounding + modes + [32653] dynamic-link: Review options for improving both security and + backwards compatibility of glibc 2.41 dlopen / execstack handling + [32781] Linux: Remove attribute access from sched_getattr + [32782] nptl: Race conditions in pthread cancellation causing crash + [32786] nptl: PTHREAD_COND_INITIALIZER compatibility with pre-2.41 versions + [32810] Crash on x86-64 if XSAVEC disabled via tunable + [32882] tst-audit10 fails with SIGILL on CPUs without AVX + [32897] dynamic-link: pthread_getattr_np fails when executable stack + tunable is set + [32981] ports: elf/tst-execstack-prog-static-tunable fails on + sparc64-linux-gnu + [32987] elf: Fix subprocess status handling for tst-dlopen-sgid + [32994] stdlib: resolve a double lock init issue after fork + [33164] iconv -o should not create executable files + [33185] Fix double-free after allocation failure in regcomp +
[33245] nptl: error in internal cancellation syscall handling + Version 2.41 Major new features: diff --git a/advisories/GLIBC-SA-2023-0001 b/advisories/GLIBC-SA-2023-0001 deleted file mode 100644 index 3d19c91b6..000000000 --- a/advisories/GLIBC-SA-2023-0001 +++ /dev/null @@ -1,14 +0,0 @@ -printf: incorrect output for integers with thousands separator and width field - -When the printf family of functions is called with a format specifier -that uses an <apostrophe> (enable grouping) and a minimum width -specifier, the resulting output could be larger than reasonably expected -by a caller that computed a tight bound on the buffer size. The -resulting larger than expected output could result in a buffer overflow -in the printf family of functions. - -CVE-Id: CVE-2023-25139 -Public-Date: 2023-02-02 -Vulnerable-Commit: e88b9f0e5cc50cab57a299dc7efe1a4eb385161d (2.37) -Fix-Commit: c980549cc6a1c03c23cc2fe3e7b0fe626a0364b0 (2.38) -Fix-Commit: 07b9521fc6369d000216b96562ff7c0ed32a16c4 (2.37-4) diff --git a/advisories/GLIBC-SA-2023-0002 b/advisories/GLIBC-SA-2023-0002 deleted file mode 100644 index 5122669a6..000000000 --- a/advisories/GLIBC-SA-2023-0002 +++ /dev/null @@ -1,15 +0,0 @@ -getaddrinfo: Stack read overflow in no-aaaa mode - -If the system is configured in no-aaaa mode via /etc/resolv.conf, -getaddrinfo is called for the AF_UNSPEC address family, and a DNS -response is received over TCP that is larger than 2048 bytes, -getaddrinfo may potentially disclose stack contents via the returned -address data, or crash.
- -CVE-Id: CVE-2023-4527 -Public-Date: 2023-09-12 -Vulnerable-Commit: f282cdbe7f436c75864e5640a409a10485e9abb2 (2.36) -Fix-Commit: bd77dd7e73e3530203be1c52c8a29d08270cb25d (2.39) -Fix-Commit: 4ea972b7edd7e36610e8cde18bf7a8149d7bac4f (2.36-113) -Fix-Commit: b7529346025a130fee483d42178b5c118da971bb (2.37-38) -Fix-Commit: b25508dd774b617f99419bdc3cf2ace4560cd2d6 (2.38-19) diff --git a/advisories/GLIBC-SA-2023-0003 b/advisories/GLIBC-SA-2023-0003 deleted file mode 100644 index d3aef8034..000000000 --- a/advisories/GLIBC-SA-2023-0003 +++ /dev/null @@ -1,15 +0,0 @@ -getaddrinfo: Potential use-after-free - -When an NSS plugin only implements the _gethostbyname2_r and -_getcanonname_r callbacks, getaddrinfo could use memory that was freed -during buffer resizing, potentially causing a crash or read or write to -arbitrary memory. - -CVE-Id: CVE-2023-4806 -Public-Date: 2023-09-12 -Fix-Commit: 973fe93a5675c42798b2161c6f29c01b0e243994 (2.39) -Fix-Commit: e09ee267c03e3150c2c9ba28625ab130705a485e (2.34-420) -Fix-Commit: e3ccb230a961b4797510e6a1f5f21fd9021853e7 (2.35-270) -Fix-Commit: a9728f798ec7f05454c95637ee6581afaa9b487d (2.36-115) -Fix-Commit: 6529a7466c935f36e9006b854d6f4e1d4876f942 (2.37-39) -Fix-Commit: 00ae4f10b504bc4564e9f22f00907093f1ab9338 (2.38-20) diff --git a/advisories/GLIBC-SA-2023-0004 b/advisories/GLIBC-SA-2023-0004 deleted file mode 100644 index 5286a7aa5..000000000 --- a/advisories/GLIBC-SA-2023-0004 +++ /dev/null @@ -1,16 +0,0 @@ -tunables: local privilege escalation through buffer overflow - -If a tunable of the form NAME=NAME=VAL is passed in the environment of a -setuid program and NAME is valid, it may result in a buffer overflow, -which could be exploited to achieve escalated privileges. This flaw was -introduced in glibc 2.34. 
- -CVE-Id: CVE-2023-4911 -Public-Date: 2023-10-03 -Vulnerable-Commit: 2ed18c5b534d9e92fc006202a5af0df6b72e7aca (2.34) -Fix-Commit: 1056e5b4c3f2d90ed2b4a55f96add28da2f4c8fa (2.39) -Fix-Commit: dcc367f148bc92e7f3778a125f7a416b093964d9 (2.34-423) -Fix-Commit: c84018a05aec80f5ee6f682db0da1130b0196aef (2.35-274) -Fix-Commit: 22955ad85186ee05834e47e665056148ca07699c (2.36-118) -Fix-Commit: b4e23c75aea756b4bddc4abcf27a1c6dca8b6bd3 (2.37-45) -Fix-Commit: 750a45a783906a19591fb8ff6b7841470f1f5701 (2.38-27) diff --git a/advisories/GLIBC-SA-2023-0005 b/advisories/GLIBC-SA-2023-0005 deleted file mode 100644 index cc4eb90b8..000000000 --- a/advisories/GLIBC-SA-2023-0005 +++ /dev/null @@ -1,18 +0,0 @@ -getaddrinfo: DoS due to memory leak - -The fix for CVE-2023-4806 introduced a memory leak when an application -calls getaddrinfo for AF_INET6 with AI_CANONNAME, AI_ALL and AI_V4MAPPED -flags set. - -CVE-Id: CVE-2023-5156 -Public-Date: 2023-09-25 -Vulnerable-Commit: e09ee267c03e3150c2c9ba28625ab130705a485e (2.34-420) -Vulnerable-Commit: e3ccb230a961b4797510e6a1f5f21fd9021853e7 (2.35-270) -Vulnerable-Commit: a9728f798ec7f05454c95637ee6581afaa9b487d (2.36-115) -Vulnerable-Commit: 6529a7466c935f36e9006b854d6f4e1d4876f942 (2.37-39) -Vulnerable-Commit: 00ae4f10b504bc4564e9f22f00907093f1ab9338 (2.38-20) -Fix-Commit: 8006457ab7e1cd556b919f477348a96fe88f2e49 (2.34-421) -Fix-Commit: 17092c0311f954e6f3c010f73ce3a78c24ac279a (2.35-272) -Fix-Commit: 856bac55f98dc840e7c27cfa82262b933385de90 (2.36-116) -Fix-Commit: 4473d1b87d04b25cdd0e0354814eeaa421328268 (2.37-42) -Fix-Commit: 5ee59ca371b99984232d7584fe2b1a758b4421d3 (2.38-24) diff --git a/advisories/GLIBC-SA-2024-0001 b/advisories/GLIBC-SA-2024-0001 deleted file mode 100644 index 28931c75a..000000000 --- a/advisories/GLIBC-SA-2024-0001 +++ /dev/null @@ -1,15 +0,0 @@ -syslog: Heap buffer overflow in __vsyslog_internal - -__vsyslog_internal did not handle a case where printing a SYSLOG_HEADER -containing a long program name failed to update the 
required buffer -size, leading to the allocation and overflow of a too-small buffer on -the heap. - -CVE-Id: CVE-2023-6246 -Public-Date: 2024-01-30 -Vulnerable-Commit: 52a5be0df411ef3ff45c10c7c308cb92993d15b1 (2.37) -Fix-Commit: 6bd0e4efcc78f3c0115e5ea9739a1642807450da (2.39) -Fix-Commit: 23514c72b780f3da097ecf33a793b7ba9c2070d2 (2.38-42) -Fix-Commit: 97a4292aa4a2642e251472b878d0ec4c46a0e59a (2.37-57) -Vulnerable-Commit: b0e7888d1fa2dbd2d9e1645ec8c796abf78880b9 (2.36-16) -Fix-Commit: d1a83b6767f68b3cb5b4b4ea2617254acd040c82 (2.36-126) diff --git a/advisories/GLIBC-SA-2024-0002 b/advisories/GLIBC-SA-2024-0002 deleted file mode 100644 index 940bfcf2f..000000000 --- a/advisories/GLIBC-SA-2024-0002 +++ /dev/null @@ -1,15 +0,0 @@ -syslog: Heap buffer overflow in __vsyslog_internal - -__vsyslog_internal used the return value of snprintf/vsnprintf to -calculate buffer sizes for memory allocation. If these functions (for -any reason) failed and returned -1, the resulting buffer would be too -small to hold output. - -CVE-Id: CVE-2023-6779 -Public-Date: 2024-01-30 -Vulnerable-Commit: 52a5be0df411ef3ff45c10c7c308cb92993d15b1 (2.37) -Fix-Commit: 7e5a0c286da33159d47d0122007aac016f3e02cd (2.39) -Fix-Commit: d0338312aace5bbfef85e03055e1212dd0e49578 (2.38-43) -Fix-Commit: 67062eccd9a65d7fda9976a56aeaaf6c25a80214 (2.37-58) -Vulnerable-Commit: b0e7888d1fa2dbd2d9e1645ec8c796abf78880b9 (2.36-16) -Fix-Commit: 2bc9d7c002bdac38b5c2a3f11b78e309d7765b83 (2.36-127) diff --git a/advisories/GLIBC-SA-2024-0003 b/advisories/GLIBC-SA-2024-0003 deleted file mode 100644 index b43a5150a..000000000 --- a/advisories/GLIBC-SA-2024-0003 +++ /dev/null @@ -1,13 +0,0 @@ -syslog: Integer overflow in __vsyslog_internal - -__vsyslog_internal calculated a buffer size by adding two integers, but -did not first check if the addition would overflow. 
- -CVE-Id: CVE-2023-6780 -Public-Date: 2024-01-30 -Vulnerable-Commit: 52a5be0df411ef3ff45c10c7c308cb92993d15b1 (2.37) -Fix-Commit: ddf542da94caf97ff43cc2875c88749880b7259b (2.39) -Fix-Commit: d37c2b20a4787463d192b32041c3406c2bd91de0 (2.38-44) -Fix-Commit: 2b58cba076e912961ceaa5fa58588e4b10f791c0 (2.37-59) -Vulnerable-Commit: b0e7888d1fa2dbd2d9e1645ec8c796abf78880b9 (2.36-16) -Fix-Commit: b9b7d6a27aa0632f334352fa400771115b3c69b7 (2.36-128) diff --git a/advisories/GLIBC-SA-2024-0004 b/advisories/GLIBC-SA-2024-0004 deleted file mode 100644 index 08df2b311..000000000 --- a/advisories/GLIBC-SA-2024-0004 +++ /dev/null @@ -1,28 +0,0 @@ -ISO-2022-CN-EXT: fix out-of-bound writes when writing escape sequence - -The iconv() function in the GNU C Library versions 2.39 and older may -overflow the output buffer passed to it by up to 4 bytes when converting -strings to the ISO-2022-CN-EXT character set, which may be used to -crash an application or overwrite a neighbouring variable. - -ISO-2022-CN-EXT uses escape sequences to indicate character set changes -(as specified by RFC 1922). While the SOdesignation has the expected -bounds checks, neither SS2designation nor SS3designation have its; -allowing a write overflow of 1, 2, or 3 bytes with fixed values: -'$+I', '$+J', '$+K', '$+L', '$+M', or '$*H'. 
- -CVE-Id: CVE-2024-2961 -Public-Date: 2024-04-17 -Vulnerable-Commit: 755104edc75c53f4a0e7440334e944ad3c6b32fc (2.1.93-169) -Fix-Commit: f9dc609e06b1136bb0408be9605ce7973a767ada (2.40) -Fix-Commit: 31da30f23cddd36db29d5b6a1c7619361b271fb4 (2.39-31) -Fix-Commit: e1135387deded5d73924f6ca20c72a35dc8e1bda (2.38-66) -Fix-Commit: 89ce64b269a897a7780e4c73a7412016381c6ecf (2.37-89) -Fix-Commit: 4ed98540a7fd19f458287e783ae59c41e64df7b5 (2.36-164) -Fix-Commit: 36280d1ce5e245aabefb877fe4d3c6cff95dabfa (2.35-315) -Fix-Commit: a8b0561db4b9847ebfbfec20075697d5492a363c (2.34-459) -Fix-Commit: ed4f16ff6bed3037266f1fa682ebd32a18fce29c (2.33-263) -Fix-Commit: 682ad4c8623e611a971839990ceef00346289cc9 (2.32-140) -Fix-Commit: 3703c32a8d304c1ee12126134ce69be965f38000 (2.31-154) - -Reported-By: Charles Fol diff --git a/advisories/GLIBC-SA-2024-0005 b/advisories/GLIBC-SA-2024-0005 deleted file mode 100644 index a59596610..000000000 --- a/advisories/GLIBC-SA-2024-0005 +++ /dev/null @@ -1,22 +0,0 @@ -nscd: Stack-based buffer overflow in netgroup cache - -If the Name Service Cache Daemon's (nscd) fixed size cache is exhausted -by client requests then a subsequent client request for netgroup data -may result in a stack-based buffer overflow. This flaw was introduced -in glibc 2.15 when the cache was added to nscd. - -This vulnerability is only present in the nscd binary. 
- -CVE-Id: CVE-2024-33599 -Public-Date: 2024-04-23 -Vulnerable-Commit: 684ae515993269277448150a1ca70db3b94aa5bd (2.15) -Fix-Commit: 69c58d5ef9f584ea198bd00f7964d364d0e6b921 (2.31-155) -Fix-Commit: a77064893bfe8a701770e2f53a4d33805bc47a5a (2.32-141) -Fix-Commit: 5c75001a96abcd50cbdb74df24c3f013188d076e (2.33-264) -Fix-Commit: 52f73e5c4e29b14e79167272297977f360ae1e97 (2.34-460) -Fix-Commit: 7a95873543ce225376faf13bb71c43dea6d24f86 (2.35-316) -Fix-Commit: caa3151ca460bdd9330adeedd68c3112d97bffe4 (2.36-165) -Fix-Commit: f75c298e747b2b8b41b1c2f551c011a52c41bfd1 (2.37-91) -Fix-Commit: 5968aebb86164034b8f8421b4abab2f837a5bdaf (2.38-72) -Fix-Commit: 1263d583d2e28afb8be53f8d6922f0842036f35d (2.39-35) -Fix-Commit: 87801a8fd06db1d654eea3e4f7626ff476a9bdaa (2.40) diff --git a/advisories/GLIBC-SA-2024-0006 b/advisories/GLIBC-SA-2024-0006 deleted file mode 100644 index d44148d3d..000000000 --- a/advisories/GLIBC-SA-2024-0006 +++ /dev/null @@ -1,32 +0,0 @@ -nscd: Null pointer crash after notfound response - -If the Name Service Cache Daemon's (nscd) cache fails to add a not-found -netgroup response to the cache, the client request can result in a null -pointer dereference. This flaw was introduced in glibc 2.15 when the -cache was added to nscd. - -This vulnerability is only present in the nscd binary. 
- -CVE-Id: CVE-2024-33600 -Public-Date: 2024-04-24 -Vulnerable-Commit: 684ae515993269277448150a1ca70db3b94aa5bd (2.15) -Fix-Commit: b048a482f088e53144d26a61c390bed0210f49f2 (2.40) -Fix-Commit: 7835b00dbce53c3c87bbbb1754a95fb5e58187aa (2.40) -Fix-Commit: c99f886de54446cd4447db6b44be93dabbdc2f8b (2.39-37) -Fix-Commit: 5a508e0b508c8ad53bd0d2fb48fd71b242626341 (2.39-36) -Fix-Commit: 2ae9446c1b7a3064743b4a51c0bbae668ee43e4c (2.38-74) -Fix-Commit: 541ea5172aa658c4bd5c6c6d6fd13903c3d5bb0a (2.38-73) -Fix-Commit: a8070b31043c7585c36ba68a74298c4f7af075c3 (2.37-93) -Fix-Commit: 5eea50c4402e39588de98aa1d4469a79774703d4 (2.37-92) -Fix-Commit: f205b3af56740e3b014915b1bd3b162afe3407ef (2.36-167) -Fix-Commit: c34f470a615b136170abd16142da5dd0c024f7d1 (2.36-166) -Fix-Commit: bafadc589fbe21ae330e8c2af74db9da44a17660 (2.35-318) -Fix-Commit: 4370bef52b0f3f3652c6aa13d7a9bb3ac079746d (2.35-317) -Fix-Commit: 1f94122289a9bf7dba573f5d60327aaa2b85cf2e (2.34-462) -Fix-Commit: 966d6ac9e40222b84bb21674cc4f83c8d72a5a26 (2.34-461) -Fix-Commit: e3eef1b8fbdd3a7917af466ca9c4b7477251ca79 (2.33-266) -Fix-Commit: f20a8d696b13c6261b52a6434899121f8b19d5a7 (2.33-265) -Fix-Commit: be602180146de37582a3da3a0caa4b719645de9c (2.32-143) -Fix-Commit: 394eae338199078b7961b051c191539870742d7b (2.32-142) -Fix-Commit: 8d7949183760170c61e55def723c1d8050187874 (2.31-157) -Fix-Commit: 304ce5fe466c4762b21b36c26926a4657b59b53e (2.31-156) diff --git a/advisories/GLIBC-SA-2024-0007 b/advisories/GLIBC-SA-2024-0007 deleted file mode 100644 index b6928fa27..000000000 --- a/advisories/GLIBC-SA-2024-0007 +++ /dev/null @@ -1,28 +0,0 @@ -nscd: netgroup cache may terminate daemon on memory allocation failure - -The Name Service Cache Daemon's (nscd) netgroup cache uses xmalloc or -xrealloc and these functions may terminate the process due to a memory -allocation failure resulting in a denial of service to the clients. The -flaw was introduced in glibc 2.15 when the cache was added to nscd. 
- -This vulnerability is only present in the nscd binary. - -Subsequent refactoring of the netgroup cache only added more uses of -xmalloc and xrealloc. Uses of xmalloc and xrealloc in other parts of -nscd only occur during startup of the daemon and so are not affected by -client requests that could trigger an out of memory followed by -termination. - -CVE-Id: CVE-2024-33601 -Public-Date: 2024-04-24 -Vulnerable-Commit: 684ae515993269277448150a1ca70db3b94aa5bd (2.15) -Fix-Commit: c04a21e050d64a1193a6daab872bca2528bda44b (2.40) -Fix-Commit: a9a8d3eebb145779a18d90e3966009a1daa63cd8 (2.39-38) -Fix-Commit: 71af8ca864345d39b746d5cee84b94b430fad5db (2.38-75) -Fix-Commit: 6e106dc214d6a033a4e945d1c6cf58061f1c5f1f (2.37-94) -Fix-Commit: b6742463694b1dfdd5120b91ee21cf05d15ec2e2 (2.36-168) -Fix-Commit: 7a5864cac60e06000394128a5a2817b03542f5a3 (2.35-319) -Fix-Commit: 86f1d5f4129c373ac6fb6df5bcf38273838843cb (2.34-463) -Fix-Commit: 4d27d4b9a188786fc6a56745506cec2acfc51f83 (2.33-267) -Fix-Commit: 3ed195a8ec89da281e3c4bf887a13d281b72d8f4 (2.32-144) -Fix-Commit: bbf5a58ccb55679217f94de706164d15372fbbc0 (2.31-158) diff --git a/advisories/GLIBC-SA-2024-0008 b/advisories/GLIBC-SA-2024-0008 deleted file mode 100644 index d93e2a6f0..000000000 --- a/advisories/GLIBC-SA-2024-0008 +++ /dev/null @@ -1,26 +0,0 @@ -nscd: netgroup cache assumes NSS callback uses in-buffer strings - -The Name Service Cache Daemon's (nscd) netgroup cache can corrupt memory -when the NSS callback does not store all strings in the provided buffer. -The flaw was introduced in glibc 2.15 when the cache was added to nscd. - -This vulnerability is only present in the nscd binary. - -There is no guarantee from the NSS callback API that the returned -strings are all within the buffer. However, the netgroup cache code -assumes that the NSS callback uses in-buffer strings and if it doesn't -the buffer resizing logic could lead to potential memory corruption. 
- -CVE-Id: CVE-2024-33602 -Public-Date: 2024-04-24 -Vulnerable-Commit: 684ae515993269277448150a1ca70db3b94aa5bd (2.15) -Fix-Commit: c04a21e050d64a1193a6daab872bca2528bda44b (2.40) -Fix-Commit: a9a8d3eebb145779a18d90e3966009a1daa63cd8 (2.39-38) -Fix-Commit: 71af8ca864345d39b746d5cee84b94b430fad5db (2.38-75) -Fix-Commit: 6e106dc214d6a033a4e945d1c6cf58061f1c5f1f (2.37-94) -Fix-Commit: b6742463694b1dfdd5120b91ee21cf05d15ec2e2 (2.36-168) -Fix-Commit: 7a5864cac60e06000394128a5a2817b03542f5a3 (2.35-319) -Fix-Commit: 86f1d5f4129c373ac6fb6df5bcf38273838843cb (2.34-463) -Fix-Commit: 4d27d4b9a188786fc6a56745506cec2acfc51f83 (2.33-267) -Fix-Commit: 3ed195a8ec89da281e3c4bf887a13d281b72d8f4 (2.32-144) -Fix-Commit: bbf5a58ccb55679217f94de706164d15372fbbc0 (2.31-158) diff --git a/advisories/GLIBC-SA-2025-0001 b/advisories/GLIBC-SA-2025-0001 deleted file mode 100644 index 45f8b8f18..000000000 --- a/advisories/GLIBC-SA-2025-0001 +++ /dev/null @@ -1,25 +0,0 @@ -assert: Buffer overflow when printing assertion failure message - -When the assert() function fails, it does not allocate enough space for the -assertion failure message string and size information, which may lead to a -buffer overflow if the message string size aligns to page size. - -This bug can be triggered when an assertion in a program fails. The assertion -failure message is allocated to allow developers to see this failure in core -dumps and it typically includes, in addition to the invariant assertion -string and function name, the name of the program. If the name of the failing -program is user controlled, for example on a local system, this could allow an -attacker to control the assertion failure to trigger this buffer overflow. - -The only viable vector for exploitation of this bug is local, if a setuid -program exists that has an existing bug that results in an assertion failure. 
-No such program has been discovered at the time of publishing this advisory, -but the presence of custom setuid programs, although strongly discouraged as a -security practice, cannot be discounted. - -CVE-Id: CVE-2025-0395 -Public-Date: 2025-01-22 -Vulnerable-Commit: f8a3b5bf8fa1d0c43d2458e03cc109a04fdef194 (2.13-175) -Fix-Commit: 68ee0f704cb81e9ad0a78c644a83e1e9cd2ee578 (2.41) -Fix-Commit: 7d4b6bcae91f29d7b4daf15bab06b66cf1d2217c (2.40-66) -Reported-By: Qualys Security Advisory diff --git a/advisories/README b/advisories/README deleted file mode 100644 index b8f8a829c..000000000 --- a/advisories/README +++ /dev/null @@ -1,77 +0,0 @@ -GNU C Library Security Advisory Format -====================================== - -Security advisories in this directory follow a simple git commit log -format, with a heading and free-format description augmented with tags -to allow parsing key information. References to code changes are -specific to the glibc repository and follow a specific format: - - Tag-name: <commit-ref> (release-version) - -The <commit-ref> indicates a specific commit in the repository. The -release-version indicates the publicly consumable release in which this -commit is known to exist. The release-version is derived from the -git-describe format, (i.e. stripped out from glibc-2.34.NNN-gxxxx) and -is of the form 2.34-NNN. If the -NNN suffix is absent, it means that -the change is in that release tarball, otherwise the change is on the -release/2.YY/master branch and not in any released tarball. - -The following tags are currently being used: - -CVE-Id: -This is the CVE-Id assigned under the CVE Program -(https://www.cve.org/). - -Public-Date: -The date this issue became publicly known. - -Vulnerable-Commit: -The commit that introduced this vulnerability. There could be multiple -entries, one for each release branch in the glibc repository; the -release-version portion of this tag should tell you which branch this is -on. - -Fix-Commit: -The commit that fixed this vulnerability. 
There could be multiple -entries for each release branch in the glibc repository, indicating that -all of those commits contributed to fixing that issue in each of those -branches. - -Reported-By: -The entity that reported this issue. There could be multiple entries, one for -each reporter. - -Adding an Advisory ------------------- - -An advisory for a CVE needs to be added on the master branch in two steps: - -1. Add the text of the advisory without any Fix-Commit tags along with - the fix for the CVE. Add the Vulnerable-Commit tag, if applicable. - The advisories directory does not exist in release branches, so keep - the advisory text commit distinct from the code changes, to ease - backports. Ask for the GLIBC-SA advisory number from the security - team. - -2. Finish all backports on release branches and then back on the msater - branch, add all commit refs to the advisory using the Fix-Commit - tags. Don't bother adding the release-version subscript since the - next step will overwrite it. - -3. Run the process-advisories.sh script in the scripts directory on the - advisory: - - scripts/process-advisories.sh update GLIBC-SA-YYYY-NNNN - - (replace YYYY-NNNN with the actual advisory number). - -4. Verify the updated advisory and push the result. - -Getting a NEWS snippet from advisories --------------------------------------- - -Run: - - scripts/process-advisories.sh news - -and copy the content into the NEWS file. diff --git a/assert/Makefile b/assert/Makefile index 65b9d0768..8d106d875 100644 --- a/assert/Makefile +++ b/assert/Makefile @@ -39,6 +39,7 @@ tests := \ test-assert-perr \ tst-assert-c++ \ tst-assert-g++ \ + tst-assert-sa-2025-0001 \ # tests ifeq ($(have-cxx-thread_local),yes) diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c new file mode 100644 index 000000000..102cb0078 --- /dev/null +++ b/assert/tst-assert-sa-2025-0001.c @@ -0,0 +1,92 @@ +/* Test for CVE-2025-0395. + Copyright The GNU Toolchain Authors. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* Test that a large enough __progname does not result in a buffer overflow + when printing an assertion failure. This was CVE-2025-0395. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern const char *__progname; + +int +do_test (int argc, char **argv) +{ + + support_need_proc ("Reads /proc/self/maps to add guards to writable maps."); + ignore_stderr (); + + /* XXX assumes that the assert is on a 2 digit line number. */ + const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n"; + + int ret = fprintf (stderr, prompt, __FILE__); + if (ret < 0) + FAIL_EXIT1 ("fprintf failed: %m\n"); + + size_t pagesize = getpagesize (); + size_t namesize = pagesize - 1 - ret; + + /* Alter the progname so that the assert message fills the entire page. */ + char progname[namesize]; + memset (progname, 'A', namesize - 1); + progname[namesize - 1] = '\0'; + __progname = progname; + + FILE *f = xfopen ("/proc/self/maps", "r"); + char *line = NULL; + size_t len = 0; + uintptr_t prev_to = 0; + + /* Pad the beginning of every writable mapping with a PROT_NONE map. This + ensures that the mmap in the assert_fail path never ends up below a + writable map and will terminate immediately in case of a buffer + overflow. 
*/ + while (xgetline (&line, &len, f)) + { + uintptr_t from, to; + char perm[4]; + + sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ", + &from, &to, + &perm[0], &perm[1], &perm[2], &perm[3]); + + bool writable = (memchr (perm, 'w', 4) != NULL); + + if (prev_to != 0 && from - prev_to > pagesize && writable) + xmmap ((void *) from - pagesize, pagesize, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE, 0); + + prev_to = to; + } + + xfclose (f); + + assert (argc < 1); + return 0; +} + +#define EXPECTED_SIGNAL SIGABRT +#define TEST_FUNCTION_ARGV do_test +#include diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs index 455aa65b6..498529325 100644 --- a/benchtests/atanh-inputs +++ b/benchtests/atanh-inputs @@ -1,6 +1,7 @@ ## args: double ## ret: double ## includes: math.h +## name: workload-random 0x1.5a2730bacd94ap-1 -0x1.b57eb40fc048ep-21 -0x1.c0b185fb450e2p-17 diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs index 7b1ac46a3..2fcb2fabf 100644 --- a/benchtests/sinh-inputs +++ b/benchtests/sinh-inputs @@ -1,6 +1,7 @@ ## args: double ## ret: double ## includes: math.h +## name: workload-random 0x1.bcb6129b5ff2bp8 -0x1.63057386325ebp9 0x1.62f1d7dc4e8bfp9 diff --git a/config.make.in b/config.make.in index 36096881b..59897eaec 100644 --- a/config.make.in +++ b/config.make.in @@ -53,6 +53,7 @@ c++-bits-std_abs-h = @CXX_BITS_STD_ABS_H@ enable-werror = @enable_werror@ have-z-execstack = @libc_cv_z_execstack@ +have-no-error-execstack = @libc_cv_no_error_execstack@ have-protected-data = @libc_cv_protected_data@ have-insert = @libc_cv_insert@ have-glob-dat-reloc = @libc_cv_has_glob_dat@ diff --git a/configure b/configure index eb8abd005..674d1d7e4 100755 --- a/configure +++ b/configure @@ -659,6 +659,7 @@ libc_cv_has_glob_dat libc_cv_fpie libc_cv_test_static_pie libc_cv_z_execstack +libc_cv_no_error_execstack ASFLAGS_config libc_cv_cc_with_libunwind libc_cv_insert @@ -7114,6 +7115,40 @@ if test $libc_cv_as_noexecstack = yes; then fi +{ printf "%s\n" 
"$as_me:${as_lineno-$LINENO}: checking for linker that supports --no-error-execstack" >&5 +printf %s "checking for linker that supports --no-error-execstack... " >&6; } +libc_linker_feature=no +cat > conftest.c <&5 + (eval $ac_try) 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } +then + if ${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS $no_ssp -Wl,--no-error-execstack -nostdlib \ + -nostartfiles -fPIC -shared -o conftest.so conftest.c 2>&1 \ + | grep "warning: --no-error-execstack ignored" > /dev/null 2>&1; then + true + else + libc_linker_feature=yes + fi +fi +rm -f conftest* +if test $libc_linker_feature = yes; then + libc_cv_no_error_execstack=yes +else + libc_cv_no_error_execstack=no +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_linker_feature" >&5 +printf "%s\n" "$libc_linker_feature" >&6; } + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for linker that supports -z execstack" >&5 printf %s "checking for linker that supports -z execstack... " >&6; } libc_linker_feature=no @@ -8643,6 +8678,35 @@ if test $libc_cv_builtin_trap = yes; then fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports __attribute__ ((aligned (65536)))" >&5 +printf %s "checking whether the compiler supports __attribute__ ((aligned (65536)))... " >&6; } +if test ${libc_cv_aligned_65536+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +char bss0xb5dce8 __attribute__ ((aligned (65536))); + +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + libc_cv_aligned_65536=yes +else case e in #( + e) libc_cv_aligned_65536=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_aligned_65536" >&5 +printf "%s\n" "$libc_cv_aligned_65536" >&6; } +config_vars="$config_vars +aligned-65536 = $libc_cv_aligned_65536" + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' @@ -8908,6 +8972,104 @@ printf "%s\n" "$libc_linker_feature" >&6; } config_vars="$config_vars load-address-ldflag = $libc_cv_load_address_ldflag" +# Check if compilers support GCS in branch protection: + +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if compiler supports -mbranch-protection=gcs" >&5 +printf %s "checking if compiler supports -mbranch-protection=gcs... " >&6; } +if test ${libc_cv_cc_gcs+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) if { ac_try='${CC-cc} -Werror -mbranch-protection=gcs -xc /dev/null -S -o /dev/null' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } +then : + libc_cv_cc_gcs=yes +else case e in #( + e) libc_cv_cc_gcs=no ;; +esac +fi ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_gcs" >&5 +printf "%s\n" "$libc_cv_cc_gcs" >&6; } +if test "$TEST_CC" = "$CC"; then + libc_cv_test_cc_gcs=$libc_cv_cc_gcs +else + +saved_CC="$CC" +CC="$TEST_CC" +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if compiler supports -mbranch-protection=gcs in testing" >&5 +printf %s "checking if compiler supports -mbranch-protection=gcs in testing... 
" >&6; } +if test ${libc_cv_test_cc_gcs+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) if { ac_try='${CC-cc} -Werror -mbranch-protection=gcs -xc /dev/null -S -o /dev/null' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } +then : + libc_cv_test_cc_gcs=yes +else case e in #( + e) libc_cv_test_cc_gcs=no ;; +esac +fi ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_test_cc_gcs" >&5 +printf "%s\n" "$libc_cv_test_cc_gcs" >&6; } + +CC="$saved_CC" + +fi + +config_vars="$config_vars +have-cc-gcs = $libc_cv_cc_gcs" +config_vars="$config_vars +have-test-cc-gcs = $libc_cv_test_cc_gcs" + +# Check if linker supports GCS marking +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for linker that supports -z gcs=always" >&5 +printf %s "checking for linker that supports -z gcs=always... " >&6; } +libc_linker_feature=no +cat > conftest.c <&5 + (eval $ac_try) 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } +then + if ${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS $no_ssp -Wl,-z,gcs=always -nostdlib \ + -nostartfiles -fPIC -shared -o conftest.so conftest.c 2>&1 \ + | grep "warning: -z gcs=always ignored" > /dev/null 2>&1; then + true + else + libc_linker_feature=yes + fi +fi +rm -f conftest* +if test $libc_linker_feature = yes; then + libc_cv_ld_gcs=yes +else + libc_cv_ld_gcs=no +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_linker_feature" >&5 +printf "%s\n" "$libc_linker_feature" >&6; } +config_vars="$config_vars +have-ld-gcs = $libc_cv_ld_gcs" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if we can build programs as PIE" >&5 printf %s "checking if we can build programs as PIE... 
" >&6; } cat confdefs.h - <<_ACEOF >conftest.$ac_ext diff --git a/configure.ac b/configure.ac index 050bfa65e..57cd24c87 100644 --- a/configure.ac +++ b/configure.ac @@ -1318,6 +1318,10 @@ if test $libc_cv_as_noexecstack = yes; then fi AC_SUBST(ASFLAGS_config) +LIBC_LINKER_FEATURE([--no-error-execstack], [-Wl,--no-error-execstack], + [libc_cv_no_error_execstack=yes], [libc_cv_no_error_execstack=no]) +AC_SUBST(libc_cv_no_error_execstack) + LIBC_LINKER_FEATURE([-z execstack], [-Wl,-z,execstack], [libc_cv_z_execstack=yes], [libc_cv_z_execstack=no]) AC_SUBST(libc_cv_z_execstack) @@ -1820,6 +1824,17 @@ if test $libc_cv_builtin_trap = yes; then AC_DEFINE([HAVE_BUILTIN_TRAP]) fi +dnl Check if +AC_CACHE_CHECK([whether the compiler supports __attribute__ ((aligned (65536)))], + libc_cv_aligned_65536, [ +AC_COMPILE_IFELSE([AC_LANG_SOURCE([ +char bss[0xb5dce8] __attribute__ ((aligned (65536))); +])], + [libc_cv_aligned_65536=yes], + [libc_cv_aligned_65536=no]) +]) +LIBC_CONFIG_VAR([aligned-65536], [$libc_cv_aligned_65536]) + dnl C++ feature tests. 
AC_LANG_PUSH([C++]) @@ -1992,6 +2007,23 @@ LIBC_LINKER_FEATURE([-Ttext-segment=$libc_cv_pde_load_address], [libc_cv_load_address_ldflag=]) LIBC_CONFIG_VAR([load-address-ldflag], [$libc_cv_load_address_ldflag]) +# Check if compilers support GCS in branch protection: +LIBC_TRY_CC_AND_TEST_CC_OPTION([if compiler supports -mbranch-protection=gcs], + [-Werror -mbranch-protection=gcs], + libc_cv_cc_gcs, + [libc_cv_cc_gcs=yes], + [libc_cv_cc_gcs=no], + libc_cv_test_cc_gcs, + [libc_cv_test_cc_gcs=yes], + [libc_cv_test_cc_gcs=no]) +LIBC_CONFIG_VAR([have-cc-gcs], [$libc_cv_cc_gcs]) +LIBC_CONFIG_VAR([have-test-cc-gcs], [$libc_cv_test_cc_gcs]) + +# Check if linker supports GCS marking +LIBC_LINKER_FEATURE([-z gcs=always], [-Wl,-z,gcs=always], + [libc_cv_ld_gcs=yes], [libc_cv_ld_gcs=no]) +LIBC_CONFIG_VAR([have-ld-gcs], [$libc_cv_ld_gcs]) + AC_MSG_CHECKING(if we can build programs as PIE) AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#ifdef PIE_UNSUPPORTED # error PIE is not supported diff --git a/elf/Makefile b/elf/Makefile index 4b1d0d874..b8064ef14 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -34,7 +34,6 @@ routines = \ dl-addr \ dl-addr-obj \ dl-early_allocate \ - dl-find_object \ dl-iteratephdr \ dl-libc \ dl-origin \ @@ -61,6 +60,8 @@ dl-routines = \ dl-deps \ dl-exception \ dl-execstack \ + dl-execstack-tunable \ + dl-find_object \ dl-fini \ dl-init \ dl-load \ @@ -266,6 +267,7 @@ tests-static-normal := \ tst-array1-static \ tst-array5-static \ tst-dl-iter-static \ + tst-dlopen-sgid \ tst-dst-static \ tst-env-setuid-static \ tst-getauxval-static \ @@ -379,6 +381,7 @@ tests += \ tst-align3 \ tst-audit-tlsdesc \ tst-audit-tlsdesc-dlopen \ + tst-audit-tlsdesc-dlopen2 \ tst-audit1 \ tst-audit2 \ tst-audit8 \ @@ -532,6 +535,8 @@ tests-internal += \ tst-dl_find_object-threads \ tst-dlmopen2 \ tst-hash-collision3 \ + tst-link-map-contiguous-ldso \ + tst-link-map-contiguous-libc \ tst-ptrguard1 \ tst-stackguard1 \ tst-tls-surplus \ @@ -543,6 +548,10 @@ tests-internal += \ unload2 \ # 
tests-internal +ifeq ($(build-hardcoded-path-in-tests),yes) +tests-internal += tst-link-map-contiguous-main +endif + tests-container += \ tst-dlopen-self-container \ tst-dlopen-tlsmodid-container \ @@ -567,9 +576,11 @@ tests-execstack-yes = \ tst-execstack \ tst-execstack-needed \ tst-execstack-prog \ + tst-execstack-tunable \ # tests-execstack-yes tests-execstack-static-yes = \ - tst-execstack-prog-static + tst-execstack-prog-static \ + tst-execstack-prog-static-tunable \ # tests-execstack-static-yes ifeq (yes,$(run-built-tests)) tests-execstack-special-yes = \ @@ -863,6 +874,7 @@ modules-names += \ tst-auditmanymod8 \ tst-auditmanymod9 \ tst-auditmod-tlsdesc \ + tst-auditmod-tlsdesc2 \ tst-auditmod1 \ tst-auditmod11 \ tst-auditmod12 \ @@ -905,6 +917,7 @@ modules-names += \ tst-dlmopen1mod \ tst-dlopen-auditdup-auditmod \ tst-dlopen-auditdupmod \ + tst-dlopen-sgid-mod \ tst-dlopen-tlsreinitmod1 \ tst-dlopen-tlsreinitmod2 \ tst-dlopen-tlsreinitmod3 \ @@ -1144,6 +1157,10 @@ tests-pie += \ tst-pie1 \ tst-pie2 \ # tests-pie +ifeq (yes,$(aligned-65536)) +tests += tst-pie-bss +tests-pie += tst-pie-bss +endif ifneq (,$(load-address-ldflag)) tests += \ tst-pie-address \ @@ -1159,6 +1176,10 @@ tests += \ tests-static += \ tst-pie-address-static \ # tests-static +ifeq (yes,$(aligned-65536)) +tests += tst-pie-bss-static +tests-static += tst-pie-bss-static +endif LDFLAGS-tst-pie-address-static += \ $(load-address-ldflag)=$(pde-load-address) endif @@ -1988,6 +2009,9 @@ $(objpfx)tst-execstack.out: $(objpfx)tst-execstack-mod.so CPPFLAGS-tst-execstack.c += -DUSE_PTHREADS=0 LDFLAGS-tst-execstack = -Wl,-z,noexecstack LDFLAGS-tst-execstack-mod.so = -Wl,-z,execstack +ifeq ($(have-no-error-execstack),yes) +LDFLAGS-tst-execstack-mod.so += -Wl,--no-error-execstack +endif $(objpfx)tst-execstack-needed: $(objpfx)tst-execstack-mod.so LDFLAGS-tst-execstack-needed = -Wl,-z,noexecstack @@ -1996,7 +2020,18 @@ LDFLAGS-tst-execstack-prog = -Wl,-z,execstack CFLAGS-tst-execstack-prog.c += 
-Wno-trampolines CFLAGS-tst-execstack-mod.c += -Wno-trampolines +# It expects loading a module with executable stack to work. +CFLAGS-tst-execstack-tunable.c += -DUSE_PTHREADS=0 -DDEFAULT_RWX_STACK=1 +$(objpfx)tst-execstack-tunable.out: $(objpfx)tst-execstack-mod.so +tst-execstack-tunable-ENV = GLIBC_TUNABLES=glibc.rtld.execstack=2 + +LDFLAGS-tst-execstack-prog-static-tunable = -Wl,-z,noexecstack +tst-execstack-prog-static-tunable-ENV = GLIBC_TUNABLES=glibc.rtld.execstack=2 + LDFLAGS-tst-execstack-prog-static = -Wl,-z,execstack +ifeq ($(have-no-error-execstack),yes) +LDFLAGS-tst-execstack-prog-static += -Wl,--no-error-execstack +endif CFLAGS-tst-execstack-prog-static.c += -Wno-trampolines ifeq (yes,$(build-hardcoded-path-in-tests)) @@ -2074,6 +2109,7 @@ $(objpfx)tst-array5-static-cmp.out: tst-array5-static.exp \ CFLAGS-tst-pie1.c += $(pie-ccflag) CFLAGS-tst-pie2.c += $(pie-ccflag) +CFLAGS-tst-pie-bss.c += $(pie-ccflag) CFLAGS-tst-pie-address.c += $(pie-ccflag) $(objpfx)tst-piemod1.so: $(libsupport) @@ -3189,6 +3225,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so +$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \ + $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules)) +tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so $(objpfx)tst-dlmopen-twice.out: \ $(objpfx)tst-dlmopen-twice-mod1.so \ @@ -3392,3 +3431,5 @@ $(objpfx)tst-nolink-libc-2: $(objpfx)tst-nolink-libc.o -Wl,--dynamic-linker=$(objpfx)ld.so $(objpfx)tst-nolink-libc-2.out: $(objpfx)tst-nolink-libc-2 $(objpfx)ld.so $< > $@ 2>&1; $(evaluate-test) + +$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so diff --git a/elf/dl-execstack-tunable.c b/elf/dl-execstack-tunable.c new file mode 100644 index 000000000..e3b638aea --- 
/dev/null +++ b/elf/dl-execstack-tunable.c @@ -0,0 +1,39 @@ +/* Stack executability handling for GNU dynamic linker. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +void +_dl_handle_execstack_tunable (void) +{ + switch (TUNABLE_GET (glibc, rtld, execstack, int32_t, NULL)) + { + case stack_tunable_mode_disable: + if ((__glibc_unlikely (GL(dl_stack_flags)) & PF_X)) + _dl_fatal_printf ( +"Fatal glibc error: executable stack is not allowed\n"); + break; + + case stack_tunable_mode_force: + if (_dl_make_stack_executable (__libc_stack_end) != 0) + _dl_fatal_printf ( +"Fatal glibc error: cannot enable executable stack as tunable requires"); + break; + } +} diff --git a/elf/dl-execstack.c b/elf/dl-execstack.c index e4d7dbe7f..ceec5b2de 100644 --- a/elf/dl-execstack.c +++ b/elf/dl-execstack.c @@ -23,7 +23,7 @@ so as to mprotect it. 
*/ int -_dl_make_stack_executable (void **stack_endp) +_dl_make_stack_executable (const void *stack_endp) { return ENOSYS; } diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c index 513e46401..c9f4c1c8d 100644 --- a/elf/dl-find_object.c +++ b/elf/dl-find_object.c @@ -356,7 +356,7 @@ _dlfo_lookup (uintptr_t pc, struct dl_find_object_internal *first1, size_t size) } int -__dl_find_object (void *pc1, struct dl_find_object *result) +_dl_find_object (void *pc1, struct dl_find_object *result) { uintptr_t pc = (uintptr_t) pc1; @@ -463,8 +463,38 @@ __dl_find_object (void *pc1, struct dl_find_object *result) return -1; } /* Transaction retry loop. */ } -hidden_def (__dl_find_object) -weak_alias (__dl_find_object, _dl_find_object) +rtld_hidden_def (_dl_find_object) + +/* Subroutine of _dlfo_process_initial to split out noncontiguous link + maps. NODELETE is the number of used _dlfo_nodelete_mappings + elements. It is incremented as needed, and the new NODELETE value + is returned. */ +static size_t +_dlfo_process_initial_noncontiguous_map (struct link_map *map, + size_t nodelete) +{ + struct dl_find_object_internal dlfo; + _dl_find_object_from_map (map, &dlfo); + + /* PT_LOAD segments for a non-contiguous link map are added to the + non-closeable mappings. */ + const ElfW(Phdr) *ph = map->l_phdr; + const ElfW(Phdr) *ph_end = map->l_phdr + map->l_phnum; + for (; ph < ph_end; ++ph) + if (ph->p_type == PT_LOAD) + { + if (_dlfo_nodelete_mappings != NULL) + { + /* Second pass only. */ + _dlfo_nodelete_mappings[nodelete] = dlfo; + ElfW(Addr) start = ph->p_vaddr + map->l_addr; + _dlfo_nodelete_mappings[nodelete].map_start = start; + _dlfo_nodelete_mappings[nodelete].map_end = start + ph->p_memsz; + } + ++nodelete; + } + return nodelete; +} /* _dlfo_process_initial is called twice. First to compute the array sizes from the initial loaded mappings. 
Second to fill in the @@ -477,29 +507,8 @@ _dlfo_process_initial (void) size_t nodelete = 0; if (!main_map->l_contiguous) - { - struct dl_find_object_internal dlfo; - _dl_find_object_from_map (main_map, &dlfo); - - /* PT_LOAD segments for a non-contiguous are added to the - non-closeable mappings. */ - for (const ElfW(Phdr) *ph = main_map->l_phdr, - *ph_end = main_map->l_phdr + main_map->l_phnum; - ph < ph_end; ++ph) - if (ph->p_type == PT_LOAD) - { - if (_dlfo_nodelete_mappings != NULL) - { - /* Second pass only. */ - _dlfo_nodelete_mappings[nodelete] = dlfo; - _dlfo_nodelete_mappings[nodelete].map_start - = ph->p_vaddr + main_map->l_addr; - _dlfo_nodelete_mappings[nodelete].map_end - = _dlfo_nodelete_mappings[nodelete].map_start + ph->p_memsz; - } - ++nodelete; - } - } + /* Contiguous case already handled in _dl_find_object_init. */ + nodelete = _dlfo_process_initial_noncontiguous_map (main_map, nodelete); size_t loaded = 0; for (Lmid_t ns = 0; ns < GL(dl_nns); ++ns) @@ -511,11 +520,18 @@ _dlfo_process_initial (void) /* lt_library link maps are implicitly NODELETE. */ if (l->l_type == lt_library || l->l_nodelete_active) { - if (_dlfo_nodelete_mappings != NULL) - /* Second pass only. */ - _dl_find_object_from_map - (l, _dlfo_nodelete_mappings + nodelete); - ++nodelete; + /* The kernel may have loaded ld.so with gaps. */ + if (!l->l_contiguous && is_rtld_link_map (l)) + nodelete + = _dlfo_process_initial_noncontiguous_map (l, nodelete); + else + { + if (_dlfo_nodelete_mappings != NULL) + /* Second pass only. */ + _dl_find_object_from_map + (l, _dlfo_nodelete_mappings + nodelete); + ++nodelete; + } } else if (l->l_type == lt_loaded) { @@ -765,7 +781,6 @@ _dl_find_object_update_1 (struct link_map **loaded, size_t count) /* Prefer newly loaded link map. 
*/ assert (loaded_index1 > 0); _dl_find_object_from_map (loaded[loaded_index1 - 1], dlfo); - loaded[loaded_index1 - 1]->l_find_object_processed = 1; --loaded_index1; } diff --git a/elf/dl-find_object.h b/elf/dl-find_object.h index e433ff874..563af3de1 100644 --- a/elf/dl-find_object.h +++ b/elf/dl-find_object.h @@ -87,7 +87,7 @@ _dl_find_object_to_external (struct dl_find_object_internal *internal, } /* Extract the object location data from a link map and writes it to - *RESULT using relaxed MO stores. */ + *RESULT using relaxed MO stores. Set L->l_find_object_processed. */ static void __attribute__ ((unused)) _dl_find_object_from_map (struct link_map *l, struct dl_find_object_internal *result) @@ -100,6 +100,8 @@ _dl_find_object_from_map (struct link_map *l, atomic_store_relaxed (&result->eh_dbase, (void *) l->l_info[DT_PLTGOT]); #endif + l->l_find_object_processed = 1; + for (const ElfW(Phdr) *ph = l->l_phdr, *ph_end = l->l_phdr + l->l_phnum; ph < ph_end; ++ph) if (ph->p_type == DLFO_EH_SEGMENT_TYPE) diff --git a/elf/dl-load.c b/elf/dl-load.c index f905578a6..945dd8a23 100644 --- a/elf/dl-load.c +++ b/elf/dl-load.c @@ -945,7 +945,7 @@ struct link_map * _dl_map_object_from_fd (const char *name, const char *origname, int fd, struct filebuf *fbp, char *realname, struct link_map *loader, int l_type, int mode, - void **stack_endp, Lmid_t nsid) + const void *stack_endp, Lmid_t nsid) { struct link_map *l = NULL; const ElfW(Ehdr) *header; @@ -2180,7 +2180,7 @@ _dl_map_object (struct link_map *loader, const char *name, void *stack_end = __libc_stack_end; return _dl_map_object_from_fd (name, origname, fd, &fb, realname, loader, - type, mode, &stack_end, nsid); + type, mode, stack_end, nsid); } struct add_path_state diff --git a/elf/dl-reloc-static-pie.c b/elf/dl-reloc-static-pie.c index e34bf5f7c..758bf9893 100644 --- a/elf/dl-reloc-static-pie.c +++ b/elf/dl-reloc-static-pie.c @@ -51,7 +51,8 @@ _dl_relocate_static_pie (void) switch (ph->p_type) { case PT_LOAD: - if 
(ph->p_offset == 0) + /* Skip the empty PT_LOAD segment at offset 0. */ + if (ph->p_filesz != 0 && ph->p_offset == 0) file_p_vaddr = ph->p_vaddr; break; case PT_DYNAMIC: diff --git a/elf/dl-support.c b/elf/dl-support.c index a7d5a5e8a..0388e2344 100644 --- a/elf/dl-support.c +++ b/elf/dl-support.c @@ -332,9 +332,7 @@ _dl_non_dynamic_init (void) break; } - if ((__glibc_unlikely (GL(dl_stack_flags)) & PF_X) - && TUNABLE_GET (glibc, rtld, execstack, int32_t, NULL) == 0) - _dl_fatal_printf ("Fatal glibc error: executable stack is not allowed\n"); + _dl_handle_execstack_tunable (); call_function_static_weak (_dl_find_object_init); diff --git a/elf/dl-tls.c b/elf/dl-tls.c index 8306a39e8..5686df5ad 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -560,6 +560,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) if (newp == NULL) oom (); memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t)); +#ifdef SHARED + /* Auditors can trigger a DTV resize event while the full malloc + is not yet in use. Mark the new DTV allocation as the + initial allocation. */ + if (!__rtld_malloc_is_complete ()) + GL(dl_initial_dtv) = &newp[1]; +#endif } else { diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list index 0b6721bc5..c03c9967f 100644 --- a/elf/dl-tunables.list +++ b/elf/dl-tunables.list @@ -138,7 +138,7 @@ glibc { execstack { type: INT_32 minval: 0 - maxval: 1 + maxval: 2 default: 1 } } diff --git a/elf/rtld.c b/elf/rtld.c index 00bec1531..c1e9721de 100644 --- a/elf/rtld.c +++ b/elf/rtld.c @@ -1242,6 +1242,60 @@ rtld_setup_main_map (struct link_map *main_map) return has_interp; } +/* Set up the program header information for the dynamic linker + itself. It can be accessed via _r_debug and dl_iterate_phdr + callbacks, and it is used by _dl_find_object. */ +static void +rtld_setup_phdr (void) +{ + /* Starting from binutils-2.23, the linker will define the magic + symbol __ehdr_start to point to our own ELF header if it is + visible in a segment that also includes the phdrs. 
*/ + + const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start; + assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr); + assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr))); + + const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff; + + _dl_rtld_map.l_phdr = rtld_phdr; + _dl_rtld_map.l_phnum = rtld_ehdr->e_phnum; + + + _dl_rtld_map.l_contiguous = 1; + /* The linker may not have produced a contiguous object. The kernel + will load the object with actual gaps (unlike the glibc loader + for shared objects, which always produces a contiguous mapping). + See similar logic in rtld_setup_main_map above. */ + { + ElfW(Addr) expected_load_address = 0; + for (const ElfW(Phdr) *ph = rtld_phdr; ph < &rtld_phdr[rtld_ehdr->e_phnum]; + ++ph) + if (ph->p_type == PT_LOAD) + { + ElfW(Addr) mapstart = ph->p_vaddr & ~(GLRO(dl_pagesize) - 1); + if (_dl_rtld_map.l_contiguous && expected_load_address != 0 + && expected_load_address != mapstart) + _dl_rtld_map.l_contiguous = 0; + ElfW(Addr) allocend = ph->p_vaddr + ph->p_memsz; + /* The next expected address is the page following this load + segment. */ + expected_load_address = ((allocend + GLRO(dl_pagesize) - 1) + & ~(GLRO(dl_pagesize) - 1)); + } + } + + /* PT_GNU_RELRO is usually the last phdr. */ + size_t cnt = rtld_ehdr->e_phnum; + while (cnt-- > 0) + if (rtld_phdr[cnt].p_type == PT_GNU_RELRO) + { + _dl_rtld_map.l_relro_addr = rtld_phdr[cnt].p_vaddr; + _dl_rtld_map.l_relro_size = rtld_phdr[cnt].p_memsz; + break; + } +} + /* Adjusts the contents of the stack and related globals for the user entry point. The ld.so processed skip_args arguments and bumped _dl_argv and _dl_argc accordingly. 
Those arguments are removed from @@ -1626,9 +1680,9 @@ dl_main (const ElfW(Phdr) *phdr, bool has_interp = rtld_setup_main_map (main_map); - if ((__glibc_unlikely (GL(dl_stack_flags)) & PF_X) - && TUNABLE_GET (glibc, rtld, execstack, int32_t, NULL) == 0) - _dl_fatal_printf ("Fatal glibc error: executable stack is not allowed\n"); + /* Handle this after PT_GNU_STACK parse, because it updates dl_stack_flags + if required. */ + _dl_handle_execstack_tunable (); /* If the current libname is different from the SONAME, add the latter as well. */ @@ -1710,33 +1764,7 @@ dl_main (const ElfW(Phdr) *phdr, ++GL(dl_ns)[LM_ID_BASE]._ns_nloaded; ++GL(dl_load_adds); - /* Starting from binutils-2.23, the linker will define the magic symbol - __ehdr_start to point to our own ELF header if it is visible in a - segment that also includes the phdrs. If that's not available, we use - the old method that assumes the beginning of the file is part of the - lowest-addressed PT_LOAD segment. */ - - /* Set up the program header information for the dynamic linker - itself. It is needed in the dl_iterate_phdr callbacks. */ - const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start; - assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr); - assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr))); - - const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff; - - _dl_rtld_map.l_phdr = rtld_phdr; - _dl_rtld_map.l_phnum = rtld_ehdr->e_phnum; - - - /* PT_GNU_RELRO is usually the last phdr. */ - size_t cnt = rtld_ehdr->e_phnum; - while (cnt-- > 0) - if (rtld_phdr[cnt].p_type == PT_GNU_RELRO) - { - _dl_rtld_map.l_relro_addr = rtld_phdr[cnt].p_vaddr; - _dl_rtld_map.l_relro_size = rtld_phdr[cnt].p_memsz; - break; - } + rtld_setup_phdr (); /* Add the dynamic linker to the TLS list if it also uses TLS. 
*/ if (_dl_rtld_map.l_tls_blocksize != 0) diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c new file mode 100644 index 000000000..7ba2c4129 --- /dev/null +++ b/elf/tst-audit-tlsdesc-dlopen2.c @@ -0,0 +1,46 @@ +/* Loading TLS-using modules from auditors (bug 32412). Main program. + Copyright (C) 2021-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +static int +do_test (void) +{ + puts ("info: start of main program"); + + /* Load TLS-using modules, to trigger DTV resizing. The dynamic + linker will load them again (requiring their own TLS) because the + dlopen calls from the auditor were in the auditing namespace. */ + for (int i = 1; i <= 19; ++i) + { + char dso[30]; + snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); + char sym[30]; + snprintf (sym, sizeof(sym), "tlsmod17a%d", i); + + void *handle = xdlopen (dso, RTLD_LAZY); + int (*func) (void) = xdlsym (handle, sym); + /* Trigger TLS allocation. */ + func (); + } + + return 0; +} + +#include diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c new file mode 100644 index 000000000..50275cd34 --- /dev/null +++ b/elf/tst-auditmod-tlsdesc2.c @@ -0,0 +1,59 @@ +/* Loading TLS-using modules from auditors (bug 32412). Audit module. 
+ Copyright (C) 2021-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +unsigned int +la_version (unsigned int version) +{ + /* Open some modules, to trigger DTV resizing before the switch to + the main malloc. */ + for (int i = 1; i <= 19; ++i) + { + char dso[30]; + snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); + char sym[30]; + snprintf (sym, sizeof(sym), "tlsmod17a%d", i); + + void *handle = dlopen (dso, RTLD_LAZY); + if (handle == NULL) + { + printf ("error: dlmopen from auditor: %s\n", dlerror ()); + fflush (stdout); + _exit (1); + } + int (*func) (void) = dlsym (handle, sym); + if (func == NULL) + { + printf ("error: dlsym from auditor: %s\n", dlerror ()); + fflush (stdout); + _exit (1); + } + /* Trigger TLS allocation. */ + func (); + } + + puts ("info: TLS-using modules loaded from auditor"); + fflush (stdout); + + return LAV_CURRENT; +} diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c new file mode 100644 index 000000000..5eb79eef4 --- /dev/null +++ b/elf/tst-dlopen-sgid-mod.c @@ -0,0 +1 @@ +/* Opening this object should not succeed. 
*/ diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c new file mode 100644 index 000000000..8aec52e19 --- /dev/null +++ b/elf/tst-dlopen-sgid.c @@ -0,0 +1,106 @@ +/* Test case for ignored LD_LIBRARY_PATH in static startup (bug 32976). + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* This is the name of our test object. Use a custom module for + testing, so that this object does not get picked up from the system + path. */ +static const char dso_name[] = "tst-dlopen-sgid-mod.so"; + +/* Used to mark the recursive invocation. */ +static const char magic_argument[] = "run-actual-test"; + +static int +do_test (void) +{ +/* Pathname of the directory that receives the shared objects this + test attempts to load. */ + char *libdir = support_create_temp_directory ("tst-dlopen-sgid-"); + + /* This is supposed to be ignored and stripped. */ + TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0); + + /* Copy of libc.so.6.
*/ + { + char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO); + char *to = xasprintf ("%s/%s", libdir, LIBC_SO); + add_temp_file (to); + support_copy_file (from, to); + free (to); + free (from); + } + + /* Copy of the test object. */ + { + char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name); + char *to = xasprintf ("%s/%s", libdir, dso_name); + add_temp_file (to); + support_copy_file (from, to); + free (to); + free (from); + } + + free (libdir); + + support_capture_subprogram_self_sgid (magic_argument); + + return 0; +} + +static void +alternative_main (int argc, char **argv) +{ + if (argc == 2 && strcmp (argv[1], magic_argument) == 0) + { + if (getgid () == getegid ()) + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); + + /* Should be removed due to SGID. */ + TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL); + + TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL); + { + const char *message = dlerror (); + TEST_COMPARE_STRING (message, + "tst-dlopen-sgid-mod.so:" + " cannot open shared object file:" + " No such file or directory"); + } + + support_record_failure_barrier (); + exit (EXIT_SUCCESS); + } +} + +#define PREPARE alternative_main +#include diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c index a4233b172..bfdb30cbd 100644 --- a/elf/tst-env-setuid-tunables.c +++ b/elf/tst-env-setuid-tunables.c @@ -105,10 +105,7 @@ do_test (int argc, char **argv) if (ret != 0) exit (1); - - /* Special return code to make sure that the child executed all the way - through. */ - exit (42); + return 0; } else { @@ -127,18 +124,7 @@ do_test (int argc, char **argv) continue; } - int status = support_capture_subprogram_self_sgid (buf); - - /* Bail out early if unsupported. 
*/ - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (WEXITSTATUS (status) != 42) - { - printf (" [%d] child failed with status %d\n", i, - WEXITSTATUS (status)); - support_record_failure (); - } + support_capture_subprogram_self_sgid (buf); } return 0; } diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c index 2c632ed30..7209acd61 100644 --- a/elf/tst-env-setuid.c +++ b/elf/tst-env-setuid.c @@ -147,10 +147,7 @@ do_test (int argc, char **argv) if (ret != 0) exit (1); - - /* Special return code to make sure that the child executed all the way - through. */ - exit (42); + return 0; } else { @@ -174,17 +171,7 @@ do_test (int argc, char **argv) free (profilepath); } - int status = support_capture_subprogram_self_sgid (SETGID_CHILD); - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - exit (EXIT_UNSUPPORTED); - - if (WEXITSTATUS (status) != 42) - { - printf (" child failed with status %d\n", - WEXITSTATUS (status)); - support_record_failure (); - } + support_capture_subprogram_self_sgid (SETGID_CHILD); return 0; } diff --git a/elf/tst-execstack-prog-static-tunable.c b/elf/tst-execstack-prog-static-tunable.c new file mode 100644 index 000000000..88b0ca126 --- /dev/null +++ b/elf/tst-execstack-prog-static-tunable.c @@ -0,0 +1 @@ +#include diff --git a/elf/tst-execstack-tunable.c b/elf/tst-execstack-tunable.c new file mode 100644 index 000000000..9f03b0f7c --- /dev/null +++ b/elf/tst-execstack-tunable.c @@ -0,0 +1 @@ +#include diff --git a/elf/tst-link-map-contiguous-ldso.c b/elf/tst-link-map-contiguous-ldso.c new file mode 100644 index 000000000..04de808bb --- /dev/null +++ b/elf/tst-link-map-contiguous-ldso.c @@ -0,0 +1,98 @@ +/* Check that _dl_find_object behavior matches up with gaps. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +do_test (void) +{ + struct link_map *l = xdlopen (LD_SO, RTLD_NOW); + if (!l->l_contiguous) + { + puts ("info: ld.so link map is not contiguous"); + + /* Try to find holes by probing with mmap. */ + int pagesize = getpagesize (); + bool gap_found = false; + ElfW(Addr) addr = l->l_map_start; + TEST_COMPARE (addr % pagesize, 0); + while (addr < l->l_map_end) + { + void *expected = (void *) addr; + void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1); + struct dl_find_object dlfo; + int dlfo_ret = _dl_find_object (expected, &dlfo); + if (ptr == expected) + { + if (dlfo_ret < 0) + { + TEST_COMPARE (dlfo_ret, -1); + printf ("info: hole without mapping data found at %p\n", ptr); + } + else + FAIL ("object \"%s\" found in gap at %p", + dlfo.dlfo_link_map->l_name, ptr); + gap_found = true; + } + else if (dlfo_ret == 0) + { + if ((void *) dlfo.dlfo_link_map != (void *) l) + { + printf ("info: object \"%s\" found at %p\n", + dlfo.dlfo_link_map->l_name, ptr); + gap_found = true; + } + } + else + TEST_COMPARE (dlfo_ret, -1); + xmunmap (ptr, 1); + addr += pagesize; + } + if (!gap_found) + FAIL ("no ld.so gap found"); + } + else + { + puts ("info: ld.so link map is contiguous"); + + /* Assert 
that ld.so is truly contiguous in memory. */ + volatile long int *p = (volatile long int *) l->l_map_start; + volatile long int *end = (volatile long int *) l->l_map_end; + while (p < end) + { + *p; + ++p; + } + } + + xdlclose (l); + + return 0; +} + +#include diff --git a/elf/tst-link-map-contiguous-libc.c b/elf/tst-link-map-contiguous-libc.c new file mode 100644 index 000000000..eb5728c76 --- /dev/null +++ b/elf/tst-link-map-contiguous-libc.c @@ -0,0 +1,57 @@ +/* Check that the entire libc.so program image is readable if contiguous. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include + +static int +do_test (void) +{ + struct link_map *l = xdlopen (LIBC_SO, RTLD_NOW); + + /* The dynamic loader fills holes with PROT_NONE mappings. */ + if (!l->l_contiguous) + FAIL_EXIT1 ("libc.so link map is not contiguous"); + + /* Direct probing does not work because not everything is readable + due to PROT_NONE mappings. 
*/ + int pagesize = getpagesize (); + ElfW(Addr) addr = l->l_map_start; + TEST_COMPARE (addr % pagesize, 0); + while (addr < l->l_map_end) + { + void *expected = (void *) addr; + void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1); + if (ptr == expected) + FAIL ("hole in libc.so memory image after %lu bytes", + (unsigned long int) (addr - l->l_map_start)); + xmunmap (ptr, 1); + addr += pagesize; + } + + xdlclose (l); + + return 0; +} +#include diff --git a/elf/tst-link-map-contiguous-main.c b/elf/tst-link-map-contiguous-main.c new file mode 100644 index 000000000..2d1a054f0 --- /dev/null +++ b/elf/tst-link-map-contiguous-main.c @@ -0,0 +1,45 @@ +/* Check that the entire main program image is readable if contiguous. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include + +static int +do_test (void) +{ + struct link_map *l = xdlopen ("", RTLD_NOW); + if (!l->l_contiguous) + FAIL_UNSUPPORTED ("main link map is not contiguous"); + + /* This check only works if the kernel loaded the main program. The + dynamic loader replaces gaps with PROT_NONE mappings, resulting + in faults. 
*/ + volatile long int *p = (volatile long int *) l->l_map_start; + volatile long int *end = (volatile long int *) l->l_map_end; + while (p < end) + { + *p; + ++p; + } + + xdlclose (l); + + return 0; +} +#include diff --git a/elf/tst-pie-bss-static.c b/elf/tst-pie-bss-static.c new file mode 100644 index 000000000..5df542f9e --- /dev/null +++ b/elf/tst-pie-bss-static.c @@ -0,0 +1,19 @@ +/* Test static PIE with an empty PT_LOAD segment at offset 0. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "tst-pie-bss.c" diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/elf/tst-pie-bss.c similarity index 66% rename from sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S rename to elf/tst-pie-bss.c index 7b45fcd63..ee9275424 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S +++ b/elf/tst-pie-bss.c @@ -1,5 +1,5 @@ -/* Optimized strcmp implementation for POWER10/PPC64. - Copyright (C) 2021-2025 Free Software Foundation, Inc. +/* Test PIE with an empty PT_LOAD segment at offset 0. + Copyright (C) 2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,11 +16,15 @@ License along with the GNU C Library; if not, see . 
*/ -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) -#define STRCMP __strcmp_power10 +#include -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) +char bss[0xb5dce8] __attribute__ ((aligned (65536))); -#include -#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */ +static int +do_test (void) +{ + printf ("Hello\n"); + return 0; +} + +#include diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp index 9f5990f34..8df6f5906 100644 --- a/elf/tst-rtld-list-tunables.exp +++ b/elf/tst-rtld-list-tunables.exp @@ -13,6 +13,6 @@ glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0x[f]+) glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0x[f]+) glibc.rtld.dynamic_sort: 2 (min: 1, max: 2) glibc.rtld.enable_secure: 0 (min: 0, max: 1) -glibc.rtld.execstack: 1 (min: 0, max: 1) +glibc.rtld.execstack: 1 (min: 0, max: 2) glibc.rtld.nns: 0x4 (min: 0x1, max: 0x10) glibc.rtld.optional_static_tls: 0x200 (min: 0x0, max: 0x[f]+) diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c index 7dba5d8df..558cfb11a 100644 --- a/iconv/iconv_prog.c +++ b/iconv/iconv_prog.c @@ -436,7 +436,7 @@ input_error (const char *path) static void open_output_direct (void) { - output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_TRUNC, 0777); + output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_TRUNC, 0666); if (output_fd < 0) output_error (); } @@ -457,7 +457,7 @@ prepare_output_file (char **argv) else { /* If iconv creates the output file, no overlap is possible. 
*/ - output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_EXCL, 0777); + output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_EXCL, 0666); if (output_fd >= 0) output_buffer_size = copy_buffer_size; else diff --git a/iconv/tst-iconv_prog-buffer.sh b/iconv/tst-iconv_prog-buffer.sh index 1c499d590..40340c38f 100644 --- a/iconv/tst-iconv_prog-buffer.sh +++ b/iconv/tst-iconv_prog-buffer.sh @@ -75,6 +75,10 @@ run_iconv () { } check_out_expected () { + if test -x "$tmp/out" ; then + echo "error: iconv output file is executable" + failure=true + fi if ! cmp -s "$tmp/out" "$tmp/expected" ; then echo "error: iconv output difference" >&$logfd echo "*** expected ***" >&$logfd diff --git a/include/dlfcn.h b/include/dlfcn.h index f49ee1b0c..a44420fa3 100644 --- a/include/dlfcn.h +++ b/include/dlfcn.h @@ -4,8 +4,7 @@ #include /* For ElfW. */ #include -extern __typeof (_dl_find_object) __dl_find_object; -hidden_proto (__dl_find_object) +rtld_hidden_proto (_dl_find_object) /* Internally used flag. 
*/ #define __RTLD_DLOPEN 0x80000000 diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in index 01ba689aa..4f194da19 100644 --- a/math/auto-libm-test-in +++ b/math/auto-libm-test-in @@ -7291,6 +7291,8 @@ log10p1 -0x1p-125 log10p1 -0x1p-1021 log10p1 -0x1p-16381 +log10p1 0x1.27f7dap-17 + log10p1 0x7.2a4368p-4 log10p1 0x6.d3a118p-4 log10p1 0x5.03f228p+0 @@ -8298,6 +8300,7 @@ sinh -0x1.3dda8ap+0 sinh -0x5.ee9218p-4 sinh -0x1.bcfc98p+0 sinh -0x6.9bbb6df7c5d08p-4 +sinh 0x1.250bfep-11 # the next value generates larger error bounds on x86_64 (ldbl-96) sinh 0x2.c5d376167f4052f4p+12 sinh max @@ -8661,6 +8664,7 @@ tan 0x1.1ad374p+0 tan -0x1.0d55b8p+0 tan 1.57079697 tan -1.57079697 +tan 0x1.ada6aap+27 tan 0x1p-5 tan 0x1p-10 tan 0x1p-15 diff --git a/math/auto-libm-test-out-log10p1 b/math/auto-libm-test-out-log10p1 index 87bdb0bcd..f5ce96572 100644 --- a/math/auto-libm-test-out-log10p1 +++ b/math/auto-libm-test-out-log10p1 @@ -1789,6 +1789,31 @@ log10p1 -0x1p-16381 = log10p1 tonearest binary128 -0x8p-16384 : -0x3.796f62a4dca1c654d56eaabeb4dp-16384 : inexact-ok underflow errno-erange-ok = log10p1 towardzero binary128 -0x8p-16384 : -0x3.796f62a4dca1c654d56eaabeb4ccp-16384 : inexact-ok underflow errno-erange-ok = log10p1 upward binary128 -0x8p-16384 : -0x3.796f62a4dca1c654d56eaabeb4ccp-16384 : inexact-ok underflow errno-erange-ok +log10p1 0x1.27f7dap-17 += log10p1 downward binary32 0x9.3fbedp-20 : 0x4.044b5p-20 : inexact-ok += log10p1 tonearest binary32 0x9.3fbedp-20 : 0x4.044b5p-20 : inexact-ok += log10p1 towardzero binary32 0x9.3fbedp-20 : 0x4.044b5p-20 : inexact-ok += log10p1 upward binary32 0x9.3fbedp-20 : 0x4.044b58p-20 : inexact-ok += log10p1 downward binary64 0x9.3fbedp-20 : 0x4.044b5157872ep-20 : inexact-ok += log10p1 tonearest binary64 0x9.3fbedp-20 : 0x4.044b5157872e4p-20 : inexact-ok += log10p1 towardzero binary64 0x9.3fbedp-20 : 0x4.044b5157872ep-20 : inexact-ok += log10p1 upward binary64 0x9.3fbedp-20 : 0x4.044b5157872e4p-20 : inexact-ok += log10p1 downward 
intel96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok += log10p1 tonearest intel96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok += log10p1 towardzero intel96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok += log10p1 upward intel96 0x9.3fbedp-20 : 0x4.044b5157872e287p-20 : inexact-ok += log10p1 downward m68k96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok += log10p1 tonearest m68k96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok += log10p1 towardzero m68k96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok += log10p1 upward m68k96 0x9.3fbedp-20 : 0x4.044b5157872e287p-20 : inexact-ok += log10p1 downward binary128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d808p-20 : inexact-ok += log10p1 tonearest binary128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d80cp-20 : inexact-ok += log10p1 towardzero binary128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d808p-20 : inexact-ok += log10p1 upward binary128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d80cp-20 : inexact-ok += log10p1 downward ibm128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d8p-20 : inexact-ok += log10p1 tonearest ibm128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d8p-20 : inexact-ok += log10p1 towardzero ibm128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d8p-20 : inexact-ok += log10p1 upward ibm128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287dap-20 : inexact-ok log10p1 0x7.2a4368p-4 = log10p1 downward binary32 0x7.2a4368p-4 : 0x2.9248dcp-4 : inexact-ok = log10p1 tonearest binary32 0x7.2a4368p-4 : 0x2.9248ep-4 : inexact-ok diff --git a/math/auto-libm-test-out-sinh b/math/auto-libm-test-out-sinh index 0b77a77ee..3924e19d8 100644 --- a/math/auto-libm-test-out-sinh +++ b/math/auto-libm-test-out-sinh @@ -2115,6 +2115,31 @@ sinh -0x6.9bbb6df7c5d08p-4 = sinh tonearest ibm128 -0x6.9bbb6df7c5d08p-4 : -0x6.cc3ddf003dcda77f8f9e892e36p-4 : inexact-ok = sinh towardzero ibm128 -0x6.9bbb6df7c5d08p-4 : -0x6.cc3ddf003dcda77f8f9e892e36p-4 : inexact-ok = sinh upward ibm128 
-0x6.9bbb6df7c5d08p-4 : -0x6.cc3ddf003dcda77f8f9e892e36p-4 : inexact-ok +sinh 0x1.250bfep-11 += sinh downward binary32 0x2.4a17fcp-12 : 0x2.4a17fcp-12 : inexact-ok += sinh tonearest binary32 0x2.4a17fcp-12 : 0x2.4a17fcp-12 : inexact-ok += sinh towardzero binary32 0x2.4a17fcp-12 : 0x2.4a17fcp-12 : inexact-ok += sinh upward binary32 0x2.4a17fcp-12 : 0x2.4a18p-12 : inexact-ok += sinh downward binary64 0x2.4a17fcp-12 : 0x2.4a17fdffffffep-12 : inexact-ok += sinh tonearest binary64 0x2.4a17fcp-12 : 0x2.4a17fep-12 : inexact-ok += sinh towardzero binary64 0x2.4a17fcp-12 : 0x2.4a17fdffffffep-12 : inexact-ok += sinh upward binary64 0x2.4a17fcp-12 : 0x2.4a17fep-12 : inexact-ok += sinh downward intel96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87cp-12 : inexact-ok += sinh tonearest intel96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff88p-12 : inexact-ok += sinh towardzero intel96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87cp-12 : inexact-ok += sinh upward intel96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff88p-12 : inexact-ok += sinh downward m68k96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87cp-12 : inexact-ok += sinh tonearest m68k96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff88p-12 : inexact-ok += sinh towardzero m68k96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87cp-12 : inexact-ok += sinh upward m68k96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff88p-12 : inexact-ok += sinh downward binary128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ec88p-12 : inexact-ok += sinh tonearest binary128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ec8ap-12 : inexact-ok += sinh towardzero binary128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ec88p-12 : inexact-ok += sinh upward binary128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ec8ap-12 : inexact-ok += sinh downward ibm128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ecp-12 : inexact-ok += sinh tonearest ibm128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786edp-12 : inexact-ok += sinh towardzero ibm128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ecp-12 : inexact-ok += sinh upward ibm128 0x2.4a17fcp-12 : 
0x2.4a17fdfffffff87e8d322786edp-12 : inexact-ok sinh 0x2.c5d376167f4052f4p+12 = sinh downward binary32 0x2.c5d378p+12 : 0xf.fffffp+124 : inexact-ok overflow errno-erange-ok = sinh tonearest binary32 0x2.c5d378p+12 : plus_infty : inexact-ok overflow errno-erange diff --git a/math/auto-libm-test-out-tan b/math/auto-libm-test-out-tan index 7d00d03e1..1d5999ab9 100644 --- a/math/auto-libm-test-out-tan +++ b/math/auto-libm-test-out-tan @@ -2532,6 +2532,31 @@ tan -1.57079697 = tan tonearest ibm128 -0x1.921fc00ece4f02f278ade6ad9fp+0 : 0x1.7b91a0851bbbafa14cf21c2b5c8p+20 : inexact-ok = tan towardzero ibm128 -0x1.921fc00ece4f02f278ade6ad9fp+0 : 0x1.7b91a0851bbbafa14cf21c2b5cp+20 : inexact-ok = tan upward ibm128 -0x1.921fc00ece4f02f278ade6ad9fp+0 : 0x1.7b91a0851bbbafa14cf21c2b5c8p+20 : inexact-ok +tan 0x1.ada6aap+27 += tan downward binary32 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok += tan tonearest binary32 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok += tan towardzero binary32 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok += tan upward binary32 0xd.6d355p+24 : 0x3.d0060cp-4 : inexact-ok += tan downward binary64 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok += tan tonearest binary64 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok += tan towardzero binary64 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok += tan upward binary64 0xd.6d355p+24 : 0x3.d006080000002p-4 : inexact-ok += tan downward intel96 0xd.6d355p+24 : 0x3.d006080000000504p-4 : inexact-ok += tan tonearest intel96 0xd.6d355p+24 : 0x3.d006080000000508p-4 : inexact-ok += tan towardzero intel96 0xd.6d355p+24 : 0x3.d006080000000504p-4 : inexact-ok += tan upward intel96 0xd.6d355p+24 : 0x3.d006080000000508p-4 : inexact-ok += tan downward m68k96 0xd.6d355p+24 : 0x3.d006080000000504p-4 : inexact-ok += tan tonearest m68k96 0xd.6d355p+24 : 0x3.d006080000000508p-4 : inexact-ok += tan towardzero m68k96 0xd.6d355p+24 : 0x3.d006080000000504p-4 : inexact-ok += tan upward m68k96 0xd.6d355p+24 : 0x3.d006080000000508p-4 : inexact-ok += tan 
downward binary128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c15ap-4 : inexact-ok += tan tonearest binary128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c15ap-4 : inexact-ok += tan towardzero binary128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c15ap-4 : inexact-ok += tan upward binary128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c15cp-4 : inexact-ok += tan downward ibm128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c1p-4 : inexact-ok += tan tonearest ibm128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c1p-4 : inexact-ok += tan towardzero ibm128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c1p-4 : inexact-ok += tan upward ibm128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c2p-4 : inexact-ok tan 0x1p-5 = tan downward binary32 0x8p-8 : 0x8.00aabp-8 : inexact-ok = tan tonearest binary32 0x8p-8 : 0x8.00aacp-8 : inexact-ok diff --git a/math/bits/mathcalls-macros.h b/math/bits/mathcalls-macros.h index 1ef07f1f5..321ae00ec 100644 --- a/math/bits/mathcalls-macros.h +++ b/math/bits/mathcalls-macros.h @@ -34,7 +34,7 @@ #define __MATHCALLX(function,suffix, args, attrib) \ __MATHDECLX (_Mdouble_,function,suffix, args, attrib) #define __MATHDECLX(type, function,suffix, args, attrib) \ - __MATHDECL_1(type, function,suffix, args) __attribute__ (attrib); + __MATHDECL_1(type, function,suffix, args) __attribute__ (attrib) #define __MATHDECL_1_IMPL(type, function, suffix, args) \ extern type __MATH_PRECNAME(function,suffix) args __THROW #define __MATHDECL_1(type, function, suffix, args) \ diff --git a/nptl/Makefile b/nptl/Makefile index 82621c795..4be778ad6 100644 --- a/nptl/Makefile +++ b/nptl/Makefile @@ -701,6 +701,9 @@ $(objpfx)tst-execstack-threads.out: $(objpfx)tst-execstack-threads-mod.so LDFLAGS-tst-execstack-threads = -Wl,-z,noexecstack LDFLAGS-tst-execstack-threads-mod.so = -Wl,-z,execstack CFLAGS-tst-execstack-threads-mod.c += -Wno-trampolines +ifeq ($(have-no-error-execstack),yes) +LDFLAGS-tst-execstack-threads-mod.so += -Wl,--no-error-execstack +endif 
tst-stackguard1-ARGS = --command "$(host-test-program-cmd) --child" tst-stackguard1-static-ARGS = --command "$(objpfx)tst-stackguard1-static --child" diff --git a/nptl/cancellation.c b/nptl/cancellation.c index 156e63dcf..bed0383a2 100644 --- a/nptl/cancellation.c +++ b/nptl/cancellation.c @@ -72,8 +72,8 @@ __syscall_cancel (__syscall_arg_t a1, __syscall_arg_t a2, __syscall_arg_t a5, __syscall_arg_t a6, __SYSCALL_CANCEL7_ARG_DEF __syscall_arg_t nr) { - int r = __internal_syscall_cancel (a1, a2, a3, a4, a5, a6, - __SYSCALL_CANCEL7_ARG nr); + long int r = __internal_syscall_cancel (a1, a2, a3, a4, a5, a6, + __SYSCALL_CANCEL7_ARG nr); return __glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (r)) ? SYSCALL_ERROR_LABEL (INTERNAL_SYSCALL_ERRNO (r)) : r; diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c index f7ce3ec51..b83827388 100644 --- a/nptl/pthread_cancel.c +++ b/nptl/pthread_cancel.c @@ -41,15 +41,17 @@ sigcancel_handler (int sig, siginfo_t *si, void *ctx) || si->si_code != SI_TKILL) return; - /* Check if asynchronous cancellation mode is set or if interrupted - instruction pointer falls within the cancellable syscall bridge. For - interruptable syscalls with external side-effects (i.e. partial reads), - the kernel will set the IP to after __syscall_cancel_arch_end, thus - disabling the cancellation and allowing the process to handle such + /* Check if asynchronous cancellation mode is set and cancellation is not + already in progress, or if interrupted instruction pointer falls within + the cancellable syscall bridge. + For interruptable syscalls with external side-effects (i.e. partial + reads), the kernel will set the IP to after __syscall_cancel_arch_end, + thus disabling the cancellation and allowing the process to handle such conditions. 
*/ struct pthread *self = THREAD_SELF; int oldval = atomic_load_relaxed (&self->cancelhandling); - if (cancel_async_enabled (oldval) || cancellation_pc_check (ctx)) + if (cancel_enabled_and_canceled_and_async (oldval) + || cancellation_pc_check (ctx)) __syscall_do_cancel (); } diff --git a/nptl/pthread_getattr_np.c b/nptl/pthread_getattr_np.c index e98e2df15..43dd16d59 100644 --- a/nptl/pthread_getattr_np.c +++ b/nptl/pthread_getattr_np.c @@ -145,9 +145,9 @@ __pthread_getattr_np (pthread_t thread_id, pthread_attr_t *attr) > (size_t) iattr->stackaddr - last_to) iattr->stacksize = (size_t) iattr->stackaddr - last_to; #else - /* The limit might be too high. */ + /* The limit might be too low. */ if ((size_t) iattr->stacksize - > to - (size_t) iattr->stackaddr) + < to - (size_t) iattr->stackaddr) iattr->stacksize = to - (size_t) iattr->stackaddr; #endif /* We succeed and no need to look further. */ diff --git a/posix/Makefile b/posix/Makefile index a650abf59..0e209a7ed 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -303,6 +303,7 @@ tests := \ tst-posix_spawn-setsid \ tst-preadwrite \ tst-preadwrite64 \ + tst-regcomp-bracket-free \ tst-regcomp-truncated \ tst-regex \ tst-regex2 \ diff --git a/posix/environ.c b/posix/environ.c index a0ed0d80e..924effe3c 100644 --- a/posix/environ.c +++ b/posix/environ.c @@ -2,6 +2,7 @@ #include #include +#include /* This must be initialized; we cannot have a weak alias into bss. */ char **__environ = NULL; @@ -10,3 +11,6 @@ weak_alias (__environ, environ) /* The SVR4 ABI says `_environ' will be the name to use in case the user overrides the weak alias `environ'. 
*/ weak_alias (__environ, _environ) + +struct environ_array *__environ_array_list; +environ_counter __environ_counter; diff --git a/posix/regcomp.c b/posix/regcomp.c index 69675d81f..5c486cee5 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -3384,6 +3384,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, { #ifdef RE_ENABLE_I18N free_charset (mbcset); + mbcset = NULL; #endif /* Build a tree for simple bracket. */ br_token.type = SIMPLE_BRACKET; @@ -3399,7 +3400,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, parse_bracket_exp_free_return: re_free (sbcset); #ifdef RE_ENABLE_I18N - free_charset (mbcset); + if (__glibc_likely (mbcset != NULL)) + free_charset (mbcset); #endif /* RE_ENABLE_I18N */ return NULL; } diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c new file mode 100644 index 000000000..3c091d8c4 --- /dev/null +++ b/posix/tst-regcomp-bracket-free.c @@ -0,0 +1,176 @@ +/* Test regcomp bracket parsing with injected allocation failures (bug 33185). + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* This test invokes regcomp multiple times, failing one memory + allocation in each call. 
The function call should fail with + REG_ESPACE (or succeed if it can recover from the allocation + failure). Previously, there was a double-free bug. */ + +#include +#include +#include +#include +#include +#include +#include + +/* Data structure allocated via MAP_SHARED, so that writes from the + subprocess are visible. */ +struct shared_data +{ + /* Number of tracked allocations performed so far. */ + volatile unsigned int allocation_count; + + /* If this number is reached, one allocation fails. */ + volatile unsigned int failing_allocation; + + /* The subprocess stores the expected name here. */ + char name[100]; +}; + +/* Allocation count in shared mapping. */ +static struct shared_data *shared; + +/* Returns true if a failure should be injected for this allocation. */ +static bool +fail_this_allocation (void) +{ + if (shared != NULL) + { + unsigned int count = shared->allocation_count; + shared->allocation_count = count + 1; + return count == shared->failing_allocation; + } + else + return false; +} + +/* Failure-injecting wrappers for allocation functions used by glibc. */ + +void * +malloc (size_t size) +{ + if (fail_this_allocation ()) + { + errno = ENOMEM; + return NULL; + } + extern __typeof (malloc) __libc_malloc; + return __libc_malloc (size); +} + +void * +calloc (size_t a, size_t b) +{ + if (fail_this_allocation ()) + { + errno = ENOMEM; + return NULL; + } + extern __typeof (calloc) __libc_calloc; + return __libc_calloc (a, b); +} + +void * +realloc (void *ptr, size_t size) +{ + if (fail_this_allocation ()) + { + errno = ENOMEM; + return NULL; + } + extern __typeof (realloc) __libc_realloc; + return __libc_realloc (ptr, size); +} + +/* No-op subprocess to verify that support_isolate_in_subprocess does + not perform any heap allocations. */ +static void +no_op (void *ignored) +{ +} + +/* Perform a regcomp call in a subprocess. Used to count its + allocations. 
*/ +static void +initialize (void *regexp1) +{ + const char *regexp = regexp1; + + shared->allocation_count = 0; + + regex_t reg; + TEST_COMPARE (regcomp (&reg, regexp, 0), 0); +} + +/* Perform regcomp in a subprocess with fault injection. */ +static void +test_in_subprocess (void *regexp1) +{ + const char *regexp = regexp1; + unsigned int inject_at = shared->failing_allocation; + + regex_t reg; + int ret = regcomp (&reg, regexp, 0); + + if (ret != 0) + { + TEST_COMPARE (ret, REG_ESPACE); + printf ("info: allocation %u failure results in return value %d," + " error %s (%d)\n", + inject_at, ret, strerrorname_np (errno), errno); + } +} + +static int +do_test (void) +{ + char regexp[] = "[:alpha:]"; + + shared = support_shared_allocate (sizeof (*shared)); + + /* Disable fault injection. */ + shared->failing_allocation = ~0U; + + support_isolate_in_subprocess (no_op, NULL); + TEST_COMPARE (shared->allocation_count, 0); + + support_isolate_in_subprocess (initialize, regexp); + + /* The number of allocations in the successful case, plus some + slack. Once the number of expected allocations is exceeded, + injecting further failures does not make a difference. 
*/ + unsigned int maximum_allocation_count = shared->allocation_count; + printf ("info: successful call performs %u allocations\n", + maximum_allocation_count); + maximum_allocation_count += 10; + + for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count; + ++inject_at) + { + shared->allocation_count = 0; + shared->failing_allocation = inject_at; + support_isolate_in_subprocess (test_in_subprocess, regexp); + } + + support_shared_free (shared); + + return 0; +} + +#include diff --git a/stdlib/Makefile b/stdlib/Makefile index 1c4fa2382..c9c8f702a 100644 --- a/stdlib/Makefile +++ b/stdlib/Makefile @@ -282,6 +282,7 @@ tests := \ tst-environ-change-3 \ tst-environ-change-4 \ tst-getenv-signal \ + tst-getenv-static \ tst-getenv-thread \ tst-getenv-unsetenv \ tst-getrandom \ @@ -377,6 +378,7 @@ tests-internal := \ # tests-internal tests-static := \ + tst-getenv-static \ tst-secure-getenv \ # tests-static diff --git a/stdlib/abort.c b/stdlib/abort.c index caa9e6dc0..904244a2f 100644 --- a/stdlib/abort.c +++ b/stdlib/abort.c @@ -19,6 +19,7 @@ #include #include #include +#include #include /* Try to get a machine dependent instruction which will make the @@ -42,7 +43,10 @@ __libc_rwlock_define_initialized (static, lock); void __abort_fork_reset_child (void) { - __libc_rwlock_init (lock); + /* Reinitialize lock without calling pthread_rwlock_init, to + avoid a valgrind DRD false positive. 
*/ + __libc_rwlock_define_initialized (, reset_lock); + memcpy (&lock, &reset_lock, sizeof (lock)); } void diff --git a/stdlib/getenv.c b/stdlib/getenv.c index 5e7212cca..1a7b0bfc0 100644 --- a/stdlib/getenv.c +++ b/stdlib/getenv.c @@ -20,9 +20,6 @@ #include #include -struct environ_array *__environ_array_list; -environ_counter __environ_counter; - char * getenv (const char *name) { diff --git a/stdlib/tst-getenv-static.c b/stdlib/tst-getenv-static.c new file mode 100644 index 000000000..f5f484c83 --- /dev/null +++ b/stdlib/tst-getenv-static.c @@ -0,0 +1,38 @@ +/* Static interposition of getenv (bug 32541). + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +/* Some programs try to interpose getenv for their own use (not + glibc's internal use). Make sure that this is possible without + introducing linker failures due to duplicate symbols. 
*/ + +char * +getenv (const char *ignored) +{ + return NULL; +} + +static int +do_test (void) +{ + TEST_COMPARE_STRING (getenv ("PATH"), NULL); + return 0; +} + +#include diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c index 3fd1d232b..c12c63aee 100644 --- a/stdlib/tst-secure-getenv.c +++ b/stdlib/tst-secure-getenv.c @@ -57,13 +57,7 @@ do_test (void) exit (1); } - int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (!WIFEXITED (status)) - FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); + support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); return 0; } @@ -82,6 +76,7 @@ alternative_main (int argc, char **argv) if (secure_getenv ("PATH") != NULL) FAIL_EXIT (4, "PATH variable not filtered out\n"); + support_record_failure_barrier (); exit (EXIT_SUCCESS); } } diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h index 91d75e5d6..b37462d0d 100644 --- a/support/capture_subprocess.h +++ b/support/capture_subprocess.h @@ -42,11 +42,12 @@ struct support_capture_subprocess support_capture_subprocess struct support_capture_subprocess support_capture_subprogram (const char *file, char *const argv[], char *const envp[]); -/* Copy the running program into a setgid binary and run it with CHILD_ID - argument. If execution is successful, return the exit status of the child - program, otherwise return a non-zero failure exit code. */ -int support_capture_subprogram_self_sgid - (char *child_id); +/* Copy the running program into a setgid binary and run it with + CHILD_ID argument. If the program exits with a non-zero status, + exit with that exit status (or status 1 if the program did not exit + normally). If the test cannot be performed, exit with + EXIT_UNSUPPORTED. */ +void support_capture_subprogram_self_sgid (const char *child_id); /* Deallocate the subprocess data captured by support_capture_subprocess. 
*/ diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c index c3ef478d1..b4e4bf950 100644 --- a/support/support_capture_subprocess.c +++ b/support/support_capture_subprocess.c @@ -21,12 +21,17 @@ #include #include +#include +#include +#include #include +#include #include #include #include #include #include +#include #include static void @@ -109,111 +114,88 @@ support_capture_subprogram (const char *file, char *const argv[], /* Copies the executable into a restricted directory, so that we can safely make it SGID with the TARGET group ID. Then runs the executable. */ -static int -copy_and_spawn_sgid (char *child_id, gid_t gid) +static void +copy_and_spawn_sgid (const char *child_id, gid_t gid) { - char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", - test_dir, (intmax_t) getpid ()); + char *dirname = support_create_temp_directory ("tst-glibc-sgid-"); char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = 1, status = 1; - - TEST_VERIFY (mkdir (dirname, 0700) == 0); - if (support_record_failure_is_failed ()) - goto err; + add_temp_file (execname); - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) + if (access ("/proc/self/exe", R_OK) != 0) FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - TEST_VERIFY (outfd >= 0); - if (support_record_failure_is_failed ()) - goto err; - - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - TEST_VERIFY (rdcount >= 0); - if (support_record_failure_is_failed ()) - goto err; - if (rdcount == 0) - break; - char *p = buf; - char *end = buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - TEST_VERIFY (wrcount > 0); - if (support_record_failure_is_failed ()) - goto err; - p += wrcount; - } - } + support_copy_file ("/proc/self/exe", execname); - bool chowned = false; - 
TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0) - || errno == EPERM); - if (support_record_failure_is_failed ()) - goto err; - else if (!chowned) - { - ret = 77; - goto err; - } + if (chown (execname, getuid (), gid) != 0) + FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m", + execname, (intmax_t) gid); - TEST_VERIFY (fchmod (outfd, 02750) == 0); - if (support_record_failure_is_failed ()) - goto err; - TEST_VERIFY (close (outfd) == 0); - if (support_record_failure_is_failed ()) - goto err; - TEST_VERIFY (close (infd) == 0); - if (support_record_failure_is_failed ()) - goto err; + if (chmod (execname, 02750) != 0) + FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname); /* We have the binary, now spawn the subprocess. Avoid using support_subprogram because we only want the program exit status, not the contents. */ - ret = 0; - infd = outfd = -1; - char * const args[] = {execname, child_id, NULL}; + char * const args[] = {execname, (char *) child_id, NULL}; + int status = support_subprogram_wait (args[0], args); - status = support_subprogram_wait (args[0], args); + free (execname); + free (dirname); -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname != NULL) + if (WIFEXITED (status)) { - unlink (execname); - free (execname); + if (WEXITSTATUS (status) == 0) + return; + else + exit (WEXITSTATUS (status)); } - if (dirname != NULL) + else + FAIL_EXIT1 ("subprogram failed with status %d", status); +} + +/* Returns true if a group with NAME has been found, and writes its + GID to *TARGET. */ +static bool +find_sgid_group (gid_t *target, const char *name) +{ + /* Do not use getgrnam_r because it does not work in statically + linked binaries if the system libc is different. 
*/ + FILE *fp = fopen ("/etc/group", "rce"); + if (fp == NULL) + return false; + __fsetlocking (fp, FSETLOCKING_BYCALLER); + + bool ok = false; + struct scratch_buffer buf; + scratch_buffer_init (&buf); + while (true) { - rmdir (dirname); - free (dirname); + struct group grp; + struct group *result = NULL; + int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result); + if (status == 0 && result != NULL) + { + if (strcmp (result->gr_name, name) == 0) + { + *target = result->gr_gid; + ok = true; + break; + } + } + else if (errno != ERANGE) + break; + else if (!scratch_buffer_grow (&buf)) + break; } - - if (ret == 77) - FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n"); - if (ret != 0) - FAIL_EXIT1 ("Failed to make sgid executable for test\n"); - - return status; + scratch_buffer_free (&buf); + fclose (fp); + return ok; } -int -support_capture_subprogram_self_sgid (char *child_id) +void +support_capture_subprogram_self_sgid (const char *child_id) { - gid_t target = 0; const int count = 64; gid_t groups[count]; @@ -225,6 +207,7 @@ support_capture_subprogram_self_sgid (char *child_id) (intmax_t) getuid ()); gid_t current = getgid (); + gid_t target = current; for (int i = 0; i < ret; ++i) { if (groups[i] != current) @@ -234,11 +217,18 @@ support_capture_subprogram_self_sgid (char *child_id) } } - if (target == 0) - FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", - (intmax_t) getuid ()); + if (target == current) + { + /* If running as root, try to find a harmless group for SGID. 
*/ + if (getuid () != 0 + || (!find_sgid_group (&target, "nogroup") + && !find_sgid_group (&target, "bin") + && !find_sgid_group (&target, "daemon"))) + FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", + (intmax_t) getuid ()); + } - return copy_and_spawn_sgid (child_id, target); + copy_and_spawn_sgid (child_id, target); } void diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c index 7709b5454..453f78031 100644 --- a/sysdeps/aarch64/fpu/acos_advsimd.c +++ b/sysdeps/aarch64/fpu/acos_advsimd.c @@ -18,24 +18,23 @@ . */ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { - float64x2_t poly[12]; - float64x2_t pi, pi_over_2; + double c1, c3, c5, c7, c9, c11; + float64x2_t c0, c2, c4, c6, c8, c10; uint64x2_t abs_mask; + float64x2_t pi, pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), - V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), - V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), - V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), - V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), - V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, - .pi = V2 (0x1.921fb54442d18p+1), - .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi = V2 (0x1.921fb54442d18p+1), .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), }; @@ -63,7 +62,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) acos(x) ~ pi/2 - (x + x^3 
P(x^2)). - The largest observed error in this region is 1.18 ulps, + The largest observed error in this region is 1.18 ulp: _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 want 0x1.0d54d1985c069p+0. @@ -71,9 +70,9 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). - The largest observed error in this region is 1.52 ulps, - _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 - want 0x1.edbbedf8a7d6cp-1. */ + The largest observed error in this region is 1.50 ulp: + _ZGVnN2v_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 + want 0x1.ec1a46aa829p-1. */ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -99,13 +98,32 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z3 = vmulq_f64 (z2, z); float64x2_t z4 = vmulq_f64 (z2, z2); float64x2_t z8 = vmulq_f64 (z4, z4); - float64x2_t z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); - /* Finalize polynomial: z + z * z2 * P(z2). */ - p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + /* Order-11 Estrin. 
*/ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p411 = vfmaq_f64 (p47, z8, p811); + float64x2_t p = vfmaq_f64 (p03, z8, p411); + + /* Finalize polynomial: z + z3 * P(z2). */ + p = vfmaq_f64 (z, z3, p); /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 = 2 Q(|x|) , for 0.5 < x < 1.0 diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c index 74e2f7df0..104f0d780 100644 --- a/sysdeps/aarch64/fpu/acos_sve.c +++ b/sysdeps/aarch64/fpu/acos_sve.c @@ -18,20 +18,21 @@ . */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[12]; - float64_t pi, pi_over_2; + float64_t c1, c3, c5, c7, c9, c11; + float64_t c0, c2, c4, c6, c8, c10; + float64_t pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. 
*/ - .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, - 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, - 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, - 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, - .pi = 0x1.921fb54442d18p+1, + .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, + .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, + .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, + .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, + .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, .pi_over_2 = 0x1.921fb54442d18p+0, }; @@ -42,20 +43,21 @@ static const struct data acos(x) ~ pi/2 - (x + x^3 P(x^2)). - The largest observed error in this region is 1.18 ulps, - _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0 - want 0x1.0d4d0f55667f7p+0. + The largest observed error in this region is 1.18 ulp: + _ZGVsMxv_acos (0x1.fbb7c9079b429p-2) got 0x1.0d51266607582p+0 + want 0x1.0d51266607583p+0. For |x| in [0.5, 1.0], use same approximation with a change of variable acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). - The largest observed error in this region is 1.52 ulps, - _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1 - want 0x1.ed82df4243f0bp-1. */ + The largest observed error in this region is 1.50 ulp: + _ZGVsMxv_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 + want 0x1.ec1a46aa829p-1. */ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); svfloat64_t ax = svabs_x (pg, x); @@ -70,24 +72,41 @@ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); /* Use a single polynomial approximation P for both intervals. 
*/ - svfloat64_t z4 = svmul_x (pg, z2, z2); - svfloat64_t z8 = svmul_x (pg, z4, z4); - svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + svfloat64_t z3 = svmul_x (ptrue, z2, z); + svfloat64_t z4 = svmul_x (ptrue, z2, z2); + svfloat64_t z8 = svmul_x (ptrue, z4, z4); + + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p411 = svmla_x (pg, p47, z8, p811); + svfloat64_t p = svmad_x (pg, p411, z8, p03); /* Finalize polynomial: z + z * z2 * P(z2). */ - p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + p = svmad_x (pg, p, z3, z); /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 = 2 Q(|x|) , for 0.5 < x < 1.0 = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
*/ - svfloat64_t y - = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign)); - - svbool_t is_neg = svcmplt (pg, x, 0.0); - svfloat64_t off = svdup_f64_z (is_neg, d->pi); - svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0)); - svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2)); - - return svmla_x (pg, add, mul, y); + svfloat64_t mul = svreinterpret_f64 ( + svlsl_m (a_gt_half, svreinterpret_u64 (sv_f64 (1.0)), 10)); + mul = svreinterpret_f64 (sveor_x (ptrue, svreinterpret_u64 (mul), sign)); + svfloat64_t add = svreinterpret_f64 ( + svorr_x (ptrue, sign, svreinterpret_u64 (sv_f64 (d->pi_over_2)))); + add = svsub_m (a_gt_half, sv_f64 (d->pi_over_2), add); + + return svmsb_x (pg, p, mul, add); } diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c index 326b2cca2..3a84959f0 100644 --- a/sysdeps/aarch64/fpu/acosh_sve.c +++ b/sysdeps/aarch64/fpu/acosh_sve.c @@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) } /* SVE approximation for double-precision acosh, based on log1p. - The largest observed error is 3.19 ULP in the region where the + The largest observed error is 3.14 ULP in the region where the argument to log1p falls in the k=0 interval, i.e. x close to 1: - SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 - want 0x1.ed23399f51373p-2. */ + SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2 + want 0x1.ef0cee7c33ce4p-2. */ svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) { /* (ix - One) >= (BigBound - One). */ diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c index 414211627..f74141c84 100644 --- a/sysdeps/aarch64/fpu/asin_advsimd.c +++ b/sysdeps/aarch64/fpu/asin_advsimd.c @@ -18,24 +18,23 @@ . 
*/ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { - float64x2_t poly[12]; + float64x2_t c0, c2, c4, c6, c8, c10; float64x2_t pi_over_2; uint64x2_t abs_mask; + double c1, c3, c5, c7, c9, c11; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), - V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), - V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), - V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), - V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), - V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, - .pi_over_2 = V2 (0x1.921fb54442d18p+0), - .abs_mask = V2 (0x7fffffffffffffff), + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), }; #define AllMask v_u64 (0xffffffffffffffff) @@ -68,8 +67,8 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). The largest observed error in this region is 2.69 ulps, - _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. 
*/ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -86,7 +85,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) return special_case (x, x, AllMask); #endif - uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5)); + uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); /* Evaluate polynomial Q(x) = y + y * z * P(z) with z = x ^ 2 and y = |x| , if |x| < 0.5 @@ -99,7 +98,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) float64x2_t z4 = vmulq_f64 (z2, z2); float64x2_t z8 = vmulq_f64 (z4, z4); float64x2_t z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* order-11 estrin. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + float64x2_t p = vfmaq_f64 (p07, z16, p811); /* Finalize polynomial: z + z * z2 * P(z2). */ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c index 9314466f5..975f408be 100644 --- a/sysdeps/aarch64/fpu/asin_sve.c +++ b/sysdeps/aarch64/fpu/asin_sve.c @@ -18,45 +18,43 @@ . 
*/ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[12]; - float64_t pi_over_2f; + float64_t c1, c3, c5, c7, c9, c11; + float64_t c0, c2, c4, c6, c8, c10; + float64_t pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, - 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6, - 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, - 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, - 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6, - -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, - .pi_over_2f = 0x1.921fb54442d18p+0, + .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, + .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, + .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, + .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, + .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = 0x1.921fb54442d18p+0, }; -#define P(i) sv_f64 (d->poly[i]) - /* Double-precision SVE implementation of vector asin(x). For |x| in [0, 0.5], use an order 11 polynomial P such that the final approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). - The largest observed error in this region is 0.52 ulps, - _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2 - want 0x1.ec13757305f26p-2. - - For |x| in [0.5, 1.0], use same approximation with a change of variable + The largest observed error in this region is 0.98 ulp: + _ZGVsMxv_asin (0x1.d98f6a748ed8ap-2) got 0x1.ec4eb661a73d3p-2 + want 0x1.ec4eb661a73d2p-2. - asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + For |x| in [0.5, 1.0], use same approximation with a change of variable: + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). 
- The largest observed error in this region is 2.69 ulps, - _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + The largest observed error in this region is 2.66 ulp: + _ZGVsMxv_asin (0x1.04024f6e2a2fbp-1) got 0x1.10b9586f087a8p-1 + want 0x1.10b9586f087abp-1. */ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); svfloat64_t ax = svabs_x (pg, x); @@ -70,17 +68,37 @@ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); /* Use a single polynomial approximation P for both intervals. */ + svfloat64_t z3 = svmul_x (pg, z2, z); svfloat64_t z4 = svmul_x (pg, z2, z2); svfloat64_t z8 = svmul_x (pg, z4, z4); - svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + + /* Order-11 Estrin scheme. */ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p411 = svmla_x (pg, p47, z8, p811); + svfloat64_t p = svmla_x (pg, p03, z8, p411); + /* Finalize polynomial: z + z * z2 * P(z2). */ - p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + p = svmla_x (pg, z, z3, p); - /* asin(|x|) = Q(|x|) , for |x| < 0.5 - = pi/2 - 2 Q(|x|), for |x| >= 0.5. 
*/ - svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f); + /* asin(|x|) = Q(|x|), for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2); - /* Copy sign. */ + /* Reinsert the sign from the argument. */ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); } diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c index 52c7c0ec6..013936c2c 100644 --- a/sysdeps/aarch64/fpu/asinf_advsimd.c +++ b/sysdeps/aarch64/fpu/asinf_advsimd.c @@ -18,22 +18,21 @@ . */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { - float32x4_t poly[5]; + float32x4_t c0, c2, c4; + float c1, c3; float32x4_t pi_over_2f; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ - .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), - V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, - .pi_over_2f = V4 (0x1.921fb6p+0f), + .c0 = V4 (0x1.55555ep-3f), .c1 = 0x1.33261ap-4f, + .c2 = V4 (0x1.70d7dcp-5f), .c3 = 0x1.b059dp-6f, + .c4 = V4 (0x1.3af7d8p-5f), .pi_over_2f = V4 (0x1.921fb6p+0f), }; #define AbsMask 0x7fffffff -#define Half 0x3f000000 #define One 0x3f800000 #define Small 0x39800000 /* 2^-12. */ @@ -47,11 +46,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) /* Single-precision implementation of vector asin(x). - For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct - rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the - following approximation. - For |x| in [Small, 0.5], use order 4 polynomial P such that the final + For |x| <0.5, use order 4 polynomial P such that the final approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). 
The largest observed error in this region is 0.83 ulps, @@ -80,24 +76,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) #endif float32x4_t ax = vreinterpretq_f32_u32 (ia); - uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); + uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f)); /* Evaluate polynomial Q(x) = y + y * z * P(z) with z = x ^ 2 and y = |x| , if |x| < 0.5 z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), - vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + vfmsq_n_f32 (v_f32 (0.5f), ax, 0.5f)); float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); /* Use a single polynomial approximation P for both intervals. */ - float32x4_t p = v_horner_4_f32 (z2, d->poly); + + /* PW Horner 3 evaluation scheme. */ + float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t c13 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c13, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c13, 1); + float32x4_t p = vfmaq_f32 (p23, d->c4, z4); + p = vfmaq_f32 (p01, p, z4); /* Finalize polynomial: z + z * z2 * P(z2). */ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); /* asin(|x|) = Q(|x|) , for |x| < 0.5 = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ float32x4_t y - = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0f)); /* Copy sign. */ return vbslq_f32 (v_u32 (AbsMask), y, x); diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c index 0889f79db..ff6b71390 100644 --- a/sysdeps/aarch64/fpu/asinh_sve.c +++ b/sysdeps/aarch64/fpu/asinh_sve.c @@ -18,36 +18,49 @@ . */ #include "sv_math.h" -#include "poly_sve_f64.h" #define SignMask (0x8000000000000000) #define One (0x3ff0000000000000) #define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). 
*/ +#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1) static const struct data { - double poly[18]; - double ln2, p3, p1, p4, p0, p2; - uint64_t n; - uint64_t off; + double even_coeffs[9]; + double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17; + uint64_t off, mask; } data = { - /* Polynomial generated using Remez on [2^-26, 1]. */ - .poly - = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, - 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, - -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, - 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, - -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, - 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 }, + /* Polynomial generated using Remez on [2^-26, 1]. */ + .even_coeffs ={ + -0x1.55555555554a7p-3, + -0x1.6db6db68332e6p-5, + -0x1.6e8b8b654a621p-6, + -0x1.c9871d10885afp-7, + -0x1.3ddca533e9f54p-7, + -0x1.b90c7099dd397p-8, + -0x1.d217026a669ecp-9, + -0x1.e0f37daef9127p-11, + -0x1.021a48685e287p-14, }, + + .c1 = 0x1.3333333326c7p-4, + .c3 = 0x1.f1c71b26fb40dp-6, + .c5 = 0x1.1c4daa9e67871p-6, + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c9 = 0x1.0becef748dafcp-7, + .c11 = 0x1.541f2bb1ffe51p-8, + .c13 = 0x1.0b5c7977aaf7p-9, + .c15 = 0x1.388b5fe542a6p-12, + .c17 = 0x1.93d4ba83d34dap-18, + .ln2 = 0x1.62e42fefa39efp-1, .p0 = -0x1.ffffffffffff7p-2, .p1 = 0x1.55555555170d4p-2, .p2 = -0x1.0000000399c27p-2, .p3 = 0x1.999b2e90e94cap-3, .p4 = -0x1.554e550bd501ep-3, - .n = 1 << V_LOG_TABLE_BITS, - .off = 0x3fe6900900000000 + .off = 0x3fe6900900000000, + .mask = 0xfffULL << 52, }; static svfloat64_t NOINLINE @@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) of the algorithm used. 
*/ svuint64_t ix = svreinterpret_u64 (x); - svuint64_t tmp = svsub_x (pg, ix, d->off); - svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), - (d->n - 1) << 1); - svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svuint64_t i_off = svsub_x (pg, ix, d->off); + svuint64_t i + = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask); + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask)); svfloat64_t z = svreinterpret_f64 (iz); svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); @@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1); svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z); - svfloat64_t kd = svcvt_f64_x (pg, k); + svfloat64_t kd + = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52)); svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0); - svfloat64_t r2 = svmul_x (pg, r, r); - + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1); - svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0); + y = svmla_lane (y, r2, p1_p4, 1); y = svmla_x (pg, p, r2, y); y = svmla_x (pg, hi, r2, y); @@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) svuint64_t iax = svbic_x (pg, ix, SignMask); svuint64_t sign = svand_x (pg, ix, SignMask); svfloat64_t ax = svreinterpret_f64 (iax); - svbool_t ge1 = svcmpge (pg, iax, One); svbool_t special = svcmpge (pg, iax, Thres); @@ -120,7 +131,7 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) svfloat64_t option_1 = sv_f64 (0); if (__glibc_likely (svptest_any (pg, ge1))) { - svfloat64_t x2 = svmul_x (pg, ax, ax); + svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); option_1 = __sv_log_inline ( svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg); } @@ -130,21 
+141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) The largest observed error in this region is 1.51 ULPs: _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1 want 0x1.c1e649ee2681dp-1. */ + svfloat64_t option_2 = sv_f64 (0); if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1)))) { - svfloat64_t x2 = svmul_x (pg, ax, ax); - svfloat64_t x4 = svmul_x (pg, x2, x2); - svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly); - option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax)); + svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); + svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2); + /* Order-17 Pairwise Horner scheme. */ + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + + svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1); + svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1); + svfloat64_t p1213 + = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0); + svfloat64_t p1415 + = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1); + svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17); + + svfloat64_t p = svmla_x (pg, p1415, x4, p1617); + p = svmla_x (pg, p1213, x4, p); + p = svmla_x (pg, p1011, x4, p); + p = svmla_x (pg, p89, x4, p); + + p = svmla_x (pg, p67, x4, p); + p = svmla_x (pg, p45, x4, p); + + p = svmla_x (pg, p23, x4, p); + + p = svmla_x (pg, p01, x4, p); + + option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax)); } - /* Choose the right option for each lane. 
*/ - svfloat64_t y = svsel (ge1, option_1, option_2); - if (__glibc_unlikely (svptest_any (pg, special))) return special_case ( - x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)), + x, + svreinterpret_f64 (sveor_x ( + pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)), special); + + /* Choose the right option for each lane. */ + svfloat64_t y = svsel (ge1, option_1, option_2); return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); } diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c index 00b4a4f08..a31d52f3a 100644 --- a/sysdeps/aarch64/fpu/atan2_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2_advsimd.c @@ -19,40 +19,38 @@ #include "math_config.h" #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; float64x2_t pi_over_2; - double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; - uint64x2_t zeroinfnan, minustwo; + uint64x2_t zeroinfnan; } data = { - /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on - [2**-1022, 1.0]. */ - .c0 = V2 (-0x1.5555555555555p-2), - .c1 = 0x1.99999999996c1p-3, - .c2 = V2 (-0x1.2492492478f88p-3), - .c3 = 0x1.c71c71bc3951cp-4, - .c4 = V2 (-0x1.745d160a7e368p-4), - .c5 = 0x1.3b139b6a88ba1p-4, - .c6 = V2 (-0x1.11100ee084227p-4), - .c7 = 0x1.e1d0f9696f63bp-5, - .c8 = V2 (-0x1.aebfe7b418581p-5), - .c9 = 0x1.842dbe9b0d916p-5, - .c10 = V2 (-0x1.5d30140ae5e99p-5), - .c11 = 0x1.338e31eb2fbbcp-5, - .c12 = V2 (-0x1.00e6eece7de8p-5), - .c13 = 0x1.860897b29e5efp-6, - .c14 = V2 (-0x1.0051381722a59p-6), - .c15 = 0x1.14e9dc19a4a4ep-7, - .c16 = V2 (-0x1.d0062b42fe3bfp-9), - .c17 = 0x1.17739e210171ap-10, - .c18 = V2 (-0x1.ab24da7be7402p-13), - .c19 = 0x1.358851160a528p-16, + /* Coefficients of polynomial P such that + atan(x)~x+x*P(x^2) on [2^-1022, 1.0]. 
*/ + .c0 = V2 (-0x1.555555555552ap-2), + .c1 = 0x1.9999999995aebp-3, + .c2 = V2 (-0x1.24924923923f6p-3), + .c3 = 0x1.c71c7184288a2p-4, + .c4 = V2 (-0x1.745d11fb3d32bp-4), + .c5 = 0x1.3b136a18051b9p-4, + .c6 = V2 (-0x1.110e6d985f496p-4), + .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = V2 (-0x1.ae644e28058c3p-5), + .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = V2 (-0x1.59d7f901566cbp-5), + .c11 = 0x1.2c982855ab069p-5, + .c12 = V2 (-0x1.eb49592998177p-6), + .c13 = 0x1.69d8b396e3d38p-6, + .c14 = V2 (-0x1.ca980345c4204p-7), + .c15 = 0x1.dc050eafde0b3p-8, + .c16 = V2 (-0x1.7ea70755b8eccp-9), + .c17 = 0x1.ba3da3de903e8p-11, + .c18 = V2 (-0x1.44a4b059b6f67p-13), + .c19 = 0x1.c4a45029e5a91p-17, .pi_over_2 = V2 (0x1.921fb54442d18p+0), .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), - .minustwo = V2 (0xc000000000000000), }; #define SignMask v_u64 (0x8000000000000000) @@ -77,10 +75,9 @@ zeroinfnan (uint64x2_t i, const struct data *d) } /* Fast implementation of vector atan2. - Maximum observed error is 2.8 ulps: - _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) - got 0x1.92d628ab678ccp-1 - want 0x1.92d628ab678cfp-1. */ + Maximum observed error is 1.97 ulps: + _ZGVnN2vv_atan2 (0x1.42337dba73768p+5, 0x1.422d748cd3e29p+5) + got 0x1.9224810264efcp-1 want 0x1.9224810264efep-1. */ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -101,26 +98,29 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) uint64x2_t pred_xlt0 = vcltzq_f64 (x); uint64x2_t pred_aygtax = vcagtq_f64 (y, x); - /* Set up z for call to atan. */ - float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); - float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); - float64x2_t z = vdivq_f64 (n, q); - - /* Work out the correct shift. 
*/ - float64x2_t shift - = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); - shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); - shift = vmulq_f64 (shift, d->pi_over_2); - - /* Calculate the polynomial approximation. - Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of - full scheme to avoid underflow in x^16. - The order 19 polynomial P approximates - (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + /* Set up z for evaluation of atan. */ + float64x2_t num = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t den = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax < ay + -pi/2 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + pi/2 when x >= 0 and ax > ay. */ + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); + float64x2_t shift2 = vreinterpretq_f64_u64 ( + vandq_u64 (pred_aygtax, vreinterpretq_u64_f64 (v_f64 (1.0)))); + shift = vaddq_f64 (shift, shift2); + + /* Calculate the polynomial approximation. */ float64x2_t z2 = vmulq_f64 (z, z); - float64x2_t x2 = vmulq_f64 (z2, z2); - float64x2_t x4 = vmulq_f64 (x2, x2); - float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t z3 = vmulq_f64 (z2, z); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); float64x2_t c13 = vld1q_f64 (&d->c1); float64x2_t c57 = vld1q_f64 (&d->c5); @@ -128,45 +128,43 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) float64x2_t c1315 = vld1q_f64 (&d->c13); float64x2_t c1719 = vld1q_f64 (&d->c17); - /* estrin_7. */ + /* Order-7 Estrin. 
*/ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); - float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); - float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); - float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); - /* estrin_11. */ + /* Order-11 Estrin. */ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); - float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); - float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); - float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); - float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); - float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); + float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); - float64x2_t ret = vfmaq_f64 (p07, p819, x8); + float64x2_t poly = vfmaq_f64 (p07, p819, z16); /* Finalize. y = shift + z + z^3 * P(z^2). */ - ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); - ret = vaddq_f64 (ret, shift); + float64x2_t ret = vfmaq_f64 (z, shift, d->pi_over_2); + ret = vfmaq_f64 (ret, z3, poly); if (__glibc_unlikely (v_any_u64 (special_cases))) return special_case (y, x, ret, sign_xy, special_cases); /* Account for the sign of x and y. 
*/ - ret = vreinterpretq_f64_u64 ( + return vreinterpretq_f64_u64 ( veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); - - return ret; } diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c index 163f61308..9e2dd249d 100644 --- a/sysdeps/aarch64/fpu/atan2_sve.c +++ b/sysdeps/aarch64/fpu/atan2_sve.c @@ -19,25 +19,25 @@ #include "math_config.h" #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[20]; - float64_t pi_over_2; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. */ - .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, - 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, - -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, - 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, - -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, - 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, - -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, - .pi_over_2 = 0x1.921fb54442d18p+0, + .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, + .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, + .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, + .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, + .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, + .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, + .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, + .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, }; - /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). 
*/ static svfloat64_t NOINLINE special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, @@ -56,15 +56,17 @@ zeroinfnan (svuint64_t i, const svbool_t pg) } /* Fast implementation of SVE atan2. Errors are greatest when y and - x are reasonably close together. The greatest observed error is 2.28 ULP: - _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) - got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ -svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) + x are reasonably close together. The greatest observed error is 1.94 ULP: + _ZGVsMxvv_atan2 (0x1.8a4bf7167228ap+5, 0x1.84971226bb57bp+5) + got 0x1.95db19dfef9ccp-1 want 0x1.95db19dfef9cep-1. */ +svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, + const svbool_t pg) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); svuint64_t ix = svreinterpret_u64 (x); svuint64_t iy = svreinterpret_u64 (y); + svbool_t ptrue = svptrue_b64 (); svbool_t cmp_x = zeroinfnan (ix, pg); svbool_t cmp_y = zeroinfnan (iy, pg); @@ -81,32 +83,67 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) svbool_t pred_aygtax = svcmpgt (pg, ay, ax); - /* Set up z for call to atan. */ - svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); - svfloat64_t d = svsel (pred_aygtax, ay, ax); - svfloat64_t z = svdiv_x (pg, n, d); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atan. */ + svfloat64_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat64_t den = svsel (pred_aygtax, ay, ax); + svfloat64_t z = svdiv_x (pg, num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax < ay + -pi/2 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + pi/2 when x >= 0 and ax > ay. 
*/ svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); + svfloat64_t shift_mul = svreinterpret_f64 ( + svorr_x (pg, sign_x, svreinterpret_u64 (sv_f64 (0x1.921fb54442d18p+0)))); shift = svsel (pred_aygtax, sv_f64 (1.0), shift); - shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); - shift = svmul_x (pg, shift, data_ptr->pi_over_2); + shift = svmla_x (pg, z, shift, shift_mul); /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ svfloat64_t z2 = svmul_x (pg, z, z); - svfloat64_t x2 = svmul_x (pg, z2, z2); - svfloat64_t x4 = svmul_x (pg, x2, x2); - svfloat64_t x8 = svmul_x (pg, x4, x4); + svfloat64_t z3 = svmul_x (pg, z2, z); + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t ret = svmla_x ( - pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly), - sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8); + /* Order-7 Estrin. */ + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); - /* y = shift + z + z^3 * P(z^2). */ - svfloat64_t z3 = svmul_x (pg, z2, z); - ret = svmla_x (pg, z, z3, ret); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + svfloat64_t p07 = svmla_x (pg, p03, z8, p47); + + /* Order-11 Estrin. 
*/ + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + svfloat64_t c1315 = svld1rq (ptrue, &d->c13); + svfloat64_t c1719 = svld1rq (ptrue, &d->c17); - ret = svadd_m (pg, ret, shift); + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); + svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); + + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); + svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); + svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); + + svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); + svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); + + svfloat64_t poly = svmla_x (pg, p07, z16, p819); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t ret = svmla_x (pg, shift, z3, poly); /* Account for the sign of x and y. */ if (__glibc_unlikely (svptest_any (pg, cmp_xy))) diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c index e65406f49..75d873897 100644 --- a/sysdeps/aarch64/fpu/atan2f_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c @@ -18,22 +18,22 @@ . */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { - float32x4_t c0, pi_over_2, c4, c6, c2; + float32x4_t c0, c4, c6, c2; float c1, c3, c5, c7; uint32x4_t comp_const; + float32x4_t pi; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. 
*/ - .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, - .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, - .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, - .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, - .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), + .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, + .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, + .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, + .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, + .pi = V4 (0x1.921fb6p+1f), .comp_const = V4 (2 * 0x7f800000lu - 1), }; #define SignMask v_u32 (0x80000000) @@ -54,13 +54,13 @@ static inline uint32x4_t zeroinfnan (uint32x4_t i, const struct data *d) { /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ - return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); + return vcgeq_u32 (vsubq_u32 (vshlq_n_u32 (i, 1), v_u32 (1)), d->comp_const); } /* Fast implementation of vector atan2f. Maximum observed error is - 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: - _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ + 2.13 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + _ZGVnN4vv_atan2f (0x1.14a9d4p-87, 0x1.0eb886p-87) got 0x1.97aea2p-1 + want 0x1.97ae9ep-1. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -81,28 +81,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) uint32x4_t pred_xlt0 = vcltzq_f32 (x); uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); - /* Set up z for call to atanf. */ - float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); - float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); - float32x4_t z = vdivq_f32 (n, q); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atanf. 
*/ + float32x4_t num = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); + float32x4_t den = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax < ay + -pi/2 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + pi/2 when x >= 0 and ax > ay. */ float32x4_t shift = vreinterpretq_f32_u32 ( - vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); - shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); - shift = vmulq_f32 (shift, d->pi_over_2); - - /* Calculate the polynomial approximation. - Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, - a standard implementation using z8 creates spurious underflow - in the very last fma (when z^8 is small enough). - Therefore, we split the last fma into a mul and an fma. - Horner and single-level Estrin have higher errors that exceed - threshold. */ + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-1.0f)))); + float32x4_t shift2 = vreinterpretq_f32_u32 ( + vandq_u32 (pred_aygtax, vreinterpretq_u32_f32 (v_f32 (0.5f)))); + shift = vaddq_f32 (shift, shift2); + + /* Calculate the polynomial approximation. */ float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z3 = vmulq_f32 (z2, z); float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t z8 = vmulq_f32 (z4, z4); float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); @@ -110,10 +113,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) float32x4_t p03 = vfmaq_f32 (p01, z4, p23); float32x4_t p47 = vfmaq_f32 (p45, z4, p67); - float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); + float32x4_t poly = vfmaq_f32 (p03, z8, p47); /* y = shift + z * P(z^2). 
*/ - ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); + float32x4_t ret = vfmaq_f32 (z, shift, d->pi); + ret = vfmaq_f32 (ret, z3, poly); if (__glibc_unlikely (v_any_u32 (special_cases))) { diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c index 5f26e2a36..4d9341952 100644 --- a/sysdeps/aarch64/fpu/atan2f_sve.c +++ b/sysdeps/aarch64/fpu/atan2f_sve.c @@ -18,18 +18,18 @@ . */ #include "sv_math.h" -#include "poly_sve_f32.h" static const struct data { - float32_t poly[8]; + float32_t c0, c2, c4, c6; + float32_t c1, c3, c5, c7; float32_t pi_over_2; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. */ - .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, - -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, - .pi_over_2 = 0x1.921fb6p+0f, + .c0 = -0x1.5554dcp-2, .c1 = 0x1.9978ecp-3, .c2 = -0x1.230a94p-3, + .c3 = 0x1.b4debp-4, .c4 = -0x1.3550dap-4, .c5 = 0x1.61eebp-5, + .c6 = -0x1.0c17d4p-6, .c7 = 0x1.7ea694p-9, .pi_over_2 = 0x1.921fb6p+0f, }; /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ @@ -51,12 +51,14 @@ zeroinfnan (svuint32_t i, const svbool_t pg) /* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum - observed error is 2.95 ULP: - _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ -svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) + observed error is 2.21 ULP: + _ZGVnN4vv_atan2f (0x1.a04aa8p+6, 0x1.9a274p+6) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. 
*/ +svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, + const svbool_t pg) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); svuint32_t ix = svreinterpret_u32 (x); svuint32_t iy = svreinterpret_u32 (y); @@ -76,29 +78,42 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) svbool_t pred_aygtax = svcmpgt (pg, ay, ax); - /* Set up z for call to atan. */ - svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); - svfloat32_t d = svsel (pred_aygtax, ay, ax); - svfloat32_t z = svdiv_x (pg, n, d); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atanf. */ + svfloat32_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat32_t den = svsel (pred_aygtax, ay, ax); + svfloat32_t z = svdiv_x (ptrue, num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax < ay + -pi/2 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + pi/2 when x >= 0 and ax > ay. */ svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1)); shift = svsel (pred_aygtax, sv_f32 (1.0), shift); shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift))); - shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */ - svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z2 = svmul_x (ptrue, z, z); + svfloat32_t z3 = svmul_x (pg, z2, z); svfloat32_t z4 = svmul_x (pg, z2, z2); svfloat32_t z8 = svmul_x (pg, z4, z4); - svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly); + svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); - /* ret = shift + z + z^3 * P(z^2). 
*/ - svfloat32_t z3 = svmul_x (pg, z2, z); - ret = svmla_x (pg, z, z3, ret); + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); + svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); - ret = svadd_m (pg, ret, shift); + svfloat32_t p03 = svmla_x (pg, p01, z4, p23); + svfloat32_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat32_t poly = svmla_x (pg, p03, z8, p47); + + /* ret = shift + z + z^3 * P(z^2). */ + svfloat32_t ret = svmla_x (pg, z, shift, sv_f32 (d->pi_over_2)); + ret = svmla_x (pg, ret, z3, poly); /* Account for the sign of x and y. */ diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c index f024fd1d7..da0d3715d 100644 --- a/sysdeps/aarch64/fpu/atan_advsimd.c +++ b/sysdeps/aarch64/fpu/atan_advsimd.c @@ -18,7 +18,6 @@ . */ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { @@ -28,16 +27,16 @@ static const struct data } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. 
*/ - .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, - .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, - .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, - .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, - .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, - .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, - .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, - .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, - .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, - .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, + .c0 = V2 (-0x1.555555555552ap-2), .c1 = 0x1.9999999995aebp-3, + .c2 = V2 (-0x1.24924923923f6p-3), .c3 = 0x1.c71c7184288a2p-4, + .c4 = V2 (-0x1.745d11fb3d32bp-4), .c5 = 0x1.3b136a18051b9p-4, + .c6 = V2 (-0x1.110e6d985f496p-4), .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = V2 (-0x1.ae644e28058c3p-5), .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = V2 (-0x1.59d7f901566cbp-5), .c11 = 0x1.2c982855ab069p-5, + .c12 = V2 (-0x1.eb49592998177p-6), .c13 = 0x1.69d8b396e3d38p-6, + .c14 = V2 (-0x1.ca980345c4204p-7), .c15 = 0x1.dc050eafde0b3p-8, + .c16 = V2 (-0x1.7ea70755b8eccp-9), .c17 = 0x1.ba3da3de903e8p-11, + .c18 = V2 (-0x1.44a4b059b6f67p-13), .c19 = 0x1.c4a45029e5a91p-17, .pi_over_2 = V2 (0x1.921fb54442d18p+0), }; @@ -47,9 +46,9 @@ static const struct data /* Fast implementation of vector atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using - z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: - _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ + z=1/x and shift = pi/2. Maximum observed error is 2.45 ulps: + _ZGVnN2v_atan (0x1.0008d737eb3e6p+0) got 0x1.92288c551a4c1p-1 + want 0x1.92288c551a4c3p-1. 
*/ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -78,59 +77,53 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); + uint64x2_t red = vcagtq_f64 (x, v_f64 (-1.0)); /* Avoid dependency in abs(x) in division (and comparison). */ - float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (-1.0), x), x); + float64x2_t shift = vreinterpretq_f64_u64 ( vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); - /* Use absolute value only when needed (odd powers of z). */ - float64x2_t az = vbslq_f64 ( - SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); - - /* Calculate the polynomial approximation. - Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of - full scheme to avoid underflow in x^16. - The order 19 polynomial P approximates - (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + + /* Reinsert sign bit from argument into the shift value. */ + shift = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (shift), sign)); + + /* Calculate polynomial approximation P(z^2) with deg(P)=19. */ float64x2_t z2 = vmulq_f64 (z, z); - float64x2_t x2 = vmulq_f64 (z2, z2); - float64x2_t x4 = vmulq_f64 (x2, x2); - float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); - /* estrin_7. */ + /* Order-7 Estrin. 
*/ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); - float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); - float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); - float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); - /* estrin_11. */ + /* Order-11 Estrin. */ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); - float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); - float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); - float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); - float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); - float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); + float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); - float64x2_t y = vfmaq_f64 (p07, p819, x8); + float64x2_t y = vfmaq_f64 (p07, p819, z16); /* Finalize. y = shift + z + z^3 * P(z^2). */ - y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); - y = vaddq_f64 (y, shift); - - /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); - return y; + y = vfmsq_f64 (v_f64 (-1.0), z2, y); + return vfmsq_f64 (shift, z, y); } diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c index 3880cedff..a6b0489cf 100644 --- a/sysdeps/aarch64/fpu/atan_sve.c +++ b/sysdeps/aarch64/fpu/atan_sve.c @@ -18,23 +18,26 @@ . */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[20]; - float64_t pi_over_2; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + float64_t shift_val, neg_one; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. */ - .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, - 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, - -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, - 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, - -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, - 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, - -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, - .pi_over_2 = 0x1.921fb54442d18p+0, + .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, + .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, + .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, + .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, + .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, + .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, + .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, + .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, + .shift_val = 0x1.490fdaa22168cp+1, .neg_one = -1, }; /* Useful constants. 
*/ @@ -43,15 +46,14 @@ static const struct data /* Fast implementation of SVE atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed - error is 2.27 ulps: - _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ + error is 2.08 ulps: + _ZGVsMxv_atan (0x1.000a7c56975e8p+0) got 0x1.922a3163e15c2p-1 + want 0x1.922a3163e15c4p-1. */ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - /* No need to trigger special case. Small cases, infs and nans - are supported by our approximation technique. */ + svbool_t ptrue = svptrue_b64 (); svuint64_t ix = svreinterpret_u64 (x); svuint64_t sign = svand_x (pg, ix, SignMask); @@ -59,32 +61,60 @@ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt (pg, x, 1.0); - /* Avoid dependency in abs(x) in division (and comparison). */ - svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x); - /* Use absolute value only when needed (odd powers of z). */ - svfloat64_t az = svabs_x (pg, z); - az = svneg_m (az, red, az); + svbool_t red = svacgt (pg, x, d->neg_one); + svfloat64_t z = svsel (red, svdiv_x (pg, sv_f64 (d->neg_one), x), x); + + /* Reuse of -1.0 to reduce constant loads. + We need a shift value of pi/2, which is created via -1 + (1 + pi/2). */ + svfloat64_t shift + = svadd_z (red, sv_f64 (d->neg_one), sv_f64 (d->shift_val)); + + /* Reinserts the sign bit of the argument to handle the case of x < -1. */ + shift = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (shift), sign)); /* Use split Estrin scheme for P(z^2) with deg(P)=19.
*/ - svfloat64_t z2 = svmul_x (pg, z, z); - svfloat64_t x2 = svmul_x (pg, z2, z2); - svfloat64_t x4 = svmul_x (pg, x2, x2); - svfloat64_t x8 = svmul_x (pg, x4, x4); + svfloat64_t z2 = svmul_x (ptrue, z, z); + svfloat64_t z4 = svmul_x (ptrue, z2, z2); + svfloat64_t z8 = svmul_x (ptrue, z4, z4); + svfloat64_t z16 = svmul_x (ptrue, z8, z8); - svfloat64_t y - = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly), - sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8); + /* Order-7 Estrin. */ + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); - /* y = shift + z + z^3 * P(z^2). */ - svfloat64_t z3 = svmul_x (pg, z2, az); - y = svmla_x (pg, az, z3, y); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + svfloat64_t p07 = svmla_x (pg, p03, z8, p47); + + /* Order-11 Estrin. */ + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + svfloat64_t c1315 = svld1rq (ptrue, &d->c13); + svfloat64_t c1719 = svld1rq (ptrue, &d->c17); - /* Apply shift as indicated by `red` predicate. */ - y = svadd_m (red, y, d->pi_over_2); + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); - /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); + svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); - return y; + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); + svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); + svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); + + svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); + svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); + + svfloat64_t y = svmla_x (pg, p07, z16, p819); + + /* y = shift + z + z^3 * P(z^2). */ + shift = svadd_m (red, z, shift); + y = svmul_x (pg, z2, y); + return svmla_x (pg, shift, z, y); } diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c index 472865ed7..817a47ef3 100644 --- a/sysdeps/aarch64/fpu/atanf_advsimd.c +++ b/sysdeps/aarch64/fpu/atanf_advsimd.c @@ -22,26 +22,35 @@ static const struct data { + uint32x4_t sign_mask, pi_over_2; + float32x4_t neg_one; +#if WANT_SIMD_EXCEPT float32x4_t poly[8]; - float32x4_t pi_over_2; +} data = { + .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3), + V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5), + V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) }, +#else + float32x4_t c0, c2, c4, c6; + float c1, c3, c5, c7; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. 
*/ - .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), - V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), - V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, - .pi_over_2 = V4 (0x1.921fb6p+0f), + .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, + .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, + .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, + .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, +#endif + .pi_over_2 = V4 (0x3fc90fdb), + .neg_one = V4 (-1.0f), + .sign_mask = V4 (0x80000000), }; -#define SignMask v_u32 (0x80000000) - -#define P(i) d->poly[i] - +#if WANT_SIMD_EXCEPT #define TinyBound 0x30800000 /* asuint(0x1p-30). */ #define BigBound 0x4e800000 /* asuint(0x1p30). */ -#if WANT_SIMD_EXCEPT static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { @@ -51,19 +60,20 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) /* Fast implementation of vector atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] - using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: - _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ + using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps: + _ZGVnN4v_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - /* Small cases, infs and nans are supported by our approximation technique, - but do not set fenv flags correctly. Only trigger special case if we need - fenv. */ uint32x4_t ix = vreinterpretq_u32_f32 (x); - uint32x4_t sign = vandq_u32 (ix, SignMask); + uint32x4_t sign = vandq_u32 (ix, d->sign_mask); #if WANT_SIMD_EXCEPT + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. 
*/ uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), v_u32 (BigBound - TinyBound)); @@ -71,41 +81,52 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) if (__glibc_unlikely (v_any_u32 (special))) return special_case (x, x, v_u32 (-1)); #endif - /* Argument reduction: - y := arctan(x) for x < 1 - y := pi/2 + arctan(-1/x) for x > 1 - Hence, use z=-1/a if x>=1, otherwise z=a. */ - uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); - /* Avoid dependency in abs(x) in division (and comparison). */ - float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); + y := arctan(x) for |x| < 1 + y := arctan(-1/x) + pi/2 for x > +1 + y := arctan(-1/x) - pi/2 for x < -1 + Hence, use z=-1/a if x>=|-1|, otherwise z=a. */ + uint32x4_t red = vcagtq_f32 (x, d->neg_one); + + float32x4_t z = vbslq_f32 (red, vdivq_f32 (d->neg_one, x), x); + + /* Shift is calculated as +-pi/2 or 0, depending on the argument case. */ float32x4_t shift = vreinterpretq_f32_u32 ( - vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); - /* Use absolute value only when needed (odd powers of z). */ - float32x4_t az = vbslq_f32 ( - SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); + vandq_u32 (red, veorq_u32 (d->pi_over_2, sign))); + + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z3 = vmulq_f32 (z, z2); + float32x4_t z4 = vmulq_f32 (z2, z2); +#if WANT_SIMD_EXCEPT /* Calculate the polynomial approximation. Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, a standard implementation using z8 creates spurious underflow in the very last fma (when z^8 is small enough). - Therefore, we split the last fma into a mul and an fma. - Horner and single-level Estrin have higher errors that exceed - threshold. */ - float32x4_t z2 = vmulq_f32 (z, z); - float32x4_t z4 = vmulq_f32 (z2, z2); - + Therefore, we split the last fma into a mul and an fma. 
*/ float32x4_t y = vfmaq_f32 ( v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); - /* y = shift + z * P(z^2). */ - y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); +#else + float32x4_t z8 = vmulq_f32 (z4, z4); + + /* Uses an Estrin scheme for polynomial approximation. */ + float32x4_t odd_coeffs = vld1q_f32 (&d->c1); + + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, odd_coeffs, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, odd_coeffs, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, odd_coeffs, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, odd_coeffs, 3); - /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); - return y; + float32x4_t y = vfmaq_f32 (p03, z8, p47); +#endif + + /* y = shift + z * P(z^2). */ + return vfmaq_f32 (vaddq_f32 (shift, z), z3, y); } libmvec_hidden_def (V_NAME_F1 (atan)) HALF_WIDTH_ALIAS_F1 (atan) diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c index 3a98d70c5..6558223e4 100644 --- a/sysdeps/aarch64/fpu/atanf_sve.c +++ b/sysdeps/aarch64/fpu/atanf_sve.c @@ -18,18 +18,26 @@ . */ #include "sv_math.h" -#include "poly_sve_f32.h" static const struct data { - float32_t poly[8]; - float32_t pi_over_2; + float32_t c1, c3, c5, c7; + float32_t c0, c2, c4, c6; + float32_t shift_val, neg_one; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. 
*/ - .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, - -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, - .pi_over_2 = 0x1.921fb6p+0f, + .c0 = -0x1.5554dcp-2, + .c1 = 0x1.9978ecp-3, + .c2 = -0x1.230a94p-3, + .c3 = 0x1.b4debp-4, + .c4 = -0x1.3550dap-4, + .c5 = 0x1.61eebp-5, + .c6 = -0x1.0c17d4p-6, + .c7 = 0x1.7ea694p-9, + /* pi/2, used as a shift value after reduction. */ + .shift_val = 0x1.921fb54442d18p+0, + .neg_one = -1.0f, }; #define SignMask (0x80000000) @@ -37,43 +45,49 @@ static const struct data /* Fast implementation of SVE atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. - Largest observed error is 2.9 ULP, close to +/-1.0: - _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 - want -0x1.967fp-1. */ + Largest observed error is 2.12 ULP: + _ZGVsMxv_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. */ svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); /* No need to trigger special case. Small cases, infs and nans are supported by our approximation technique. */ svuint32_t ix = svreinterpret_u32 (x); - svuint32_t sign = svand_x (pg, ix, SignMask); + svuint32_t sign = svand_x (ptrue, ix, SignMask); /* Argument reduction: y := arctan(x) for x < 1 - y := pi/2 + arctan(-1/x) for x > 1 - Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt (pg, x, 1.0f); - /* Avoid dependency in abs(x) in division (and comparison). */ - svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x); - /* Use absolute value only when needed (odd powers of z). */ - svfloat32_t az = svabs_x (pg, z); - az = svneg_m (az, red, az); - - /* Use split Estrin scheme for P(z^2) with deg(P)=7. 
*/ - svfloat32_t z2 = svmul_x (pg, z, z); - svfloat32_t z4 = svmul_x (pg, z2, z2); - svfloat32_t z8 = svmul_x (pg, z4, z4); - - svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly); - - /* y = shift + z + z^3 * P(z^2). */ - svfloat32_t z3 = svmul_x (pg, z2, az); - y = svmla_x (pg, az, z3, y); - - /* Apply shift as indicated by 'red' predicate. */ - y = svadd_m (red, y, sv_f32 (d->pi_over_2)); - - /* y = atan(x) if x>0, -atan(-x) otherwise. */ - return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); + y := arctan(-1/x) + pi/2 for x > +1 + y := arctan(-1/x) - pi/2 for x < -1 + Hence, use z=-1/a if |x|>=|-1|, otherwise z=a. */ + svbool_t red = svacgt (pg, x, d->neg_one); + svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (d->neg_one), x), x); + + /* Reinserts the sign bit of the argument to handle the case of x < -1. */ + svfloat32_t shift = svreinterpret_f32 ( + sveor_x (red, svreinterpret_u32 (sv_f32 (d->shift_val)), sign)); + + svfloat32_t z2 = svmul_x (ptrue, z, z); + svfloat32_t z3 = svmul_x (ptrue, z2, z); + svfloat32_t z4 = svmul_x (ptrue, z2, z2); + svfloat32_t z8 = svmul_x (ptrue, z4, z4); + + svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); + + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); + svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); + + svfloat32_t p03 = svmla_x (pg, p01, z4, p23); + svfloat32_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat32_t y = svmla_x (pg, p03, z8, p47); + + /* shift + z + z^3 * P(z^2). 
*/ + shift = svadd_m (red, z, shift); + return svmla_x (pg, shift, z3, y); } diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c index 16a7cf6aa..958d69a5f 100644 --- a/sysdeps/aarch64/fpu/atanh_sve.c +++ b/sysdeps/aarch64/fpu/atanh_sve.c @@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) } /* SVE approximation for double-precision atanh, based on log1p. - The greatest observed error is 2.81 ULP: + The greatest observed error is 3.3 ULP: _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 want 0x1.ffd8ff31b501cp-6. */ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) @@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half)); /* It is special if iax >= 1. */ -// svbool_t special = svcmpge (pg, iax, One); svbool_t special = svacge (pg, x, 1.0); /* Computation is performed based on the following sequence of equality: diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c index ca4405353..f5a163b05 100644 --- a/sysdeps/aarch64/fpu/cosh_sve.c +++ b/sysdeps/aarch64/fpu/cosh_sve.c @@ -21,69 +21,99 @@ static const struct data { - float64_t poly[3]; - float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; - uint64_t index_mask, special_bound; + double c0, c2; + double c1, c3; + float64_t inv_ln2, ln2_hi, ln2_lo, shift; + uint64_t special_bound; } data = { - .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, - 0x1.5555576a59599p-5, }, - - .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */ - /* -ln2/N. */ - .ln2_hi = -0x1.62e42fefa39efp-9, - .ln2_lo = -0x1.abc9e3b39803f3p-64, - .shift = 0x1.8p+52, - .thres = 704.0, - - .index_mask = 0xff, - /* 0x1.6p9, above which exp overflows. */ - .special_bound = 0x4086000000000000, + /* Generated using Remez, in [-log(2)/128, log(2)/128]. 
*/ + .c0 = 0x1.fffffffffdbcdp-2, + .c1 = 0x1.555555555444cp-3, + .c2 = 0x1.555573c6a9f7dp-5, + .c3 = 0x1.1111266d28935p-7, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* 1/ln2. */ + .inv_ln2 = 0x1.71547652b82fep+0, + .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */ + + /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */ + .special_bound = 0x40862e37e7d8ba72, }; -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) -{ - return sv_call_f64 (cosh, x, y, special); -} - -/* Helper for approximating exp(x). Copied from sv_exp_tail, with no - special-case handling or tail. */ +/* Helper for approximating exp(x)/2. + Functionally identical to FEXPA exp(x), but an adjustment in + the shift value which leads to a reduction in the exponent of scale by 1, + thus halving the result at no cost. */ static inline svfloat64_t -exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d) { /* Calculate exp(x). 
*/ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svuint64_t u = svreinterpret_u64 (z); svfloat64_t n = svsub_x (pg, z, d->shift); - svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi); - r = svmla_x (pg, r, n, d->ln2_lo); + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); - svuint64_t u = svreinterpret_u64 (z); - svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); - svuint64_t i = svand_x (pg, u, d->index_mask); + svfloat64_t r = x; + r = svmls_lane (r, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); - svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); - y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); - y = svmla_x (pg, sv_f64 (1.0), r, y); - y = svmul_x (pg, r, y); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); + svfloat64_t p04 = svmla_x (pg, p01, p23, r2); + svfloat64_t p = svmla_x (pg, r, p04, r2); - /* s = 2^(n/N). */ - u = svld1_gather_index (pg, __v_exp_tail_data, i); - svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e)); + svfloat64_t scale = svexpa (u); - return svmla_x (pg, s, s, y); + return svmla_x (pg, scale, scale, p); +} + +/* Vectorised special case to handle values past where exp_inline overflows. + Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double + the valid range of inputs, and returns inf for anything past that. */ +static svfloat64_t NOINLINE +special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t, + const struct data *d) +{ + /* Finish fast path to compute values for non-special cases. */ + svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25); + svfloat64_t y = svadd_x (pg, t, inv_twoexp); + + /* Halves input value, and then check if any cases + are still going to overflow. 
*/ + ax = svmul_x (special, ax, 0.5); + svbool_t is_safe + = svcmplt (special, svreinterpret_u64 (ax), d->special_bound); + + /* Computes exp(x/2), and sets any overflowing lanes to inf. */ + svfloat64_t half_exp = exp_over_two_inline (special, ax, d); + half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY)); + + /* Construct special case cosh(x) = (exp(x/2)^2)/2. */ + svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2); + svfloat64_t special_y = svmul_x (special, exp, half_exp); + + /* Select correct return values for special and non-special cases. */ + special_y = svsel (special, special_y, y); + + /* Ensure an input of nan is correctly propagated. */ + svbool_t is_nan + = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000)); + return svsel (is_nan, ax, svsel (special, special_y, y)); } /* Approximation for SVE double-precision cosh(x) using exp_inline. cosh(x) = (exp(x) + exp(-x)) / 2. - The greatest observed error is in the scalar fall-back region, so is the - same as the scalar routine, 1.93 ULP: - _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021 - want 0x1.fd774e958236fp+1021. - - The greatest observed error in the non-special region is 1.54 ULP: - _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8 - want 0x1.f5e2bb8d5c991p+8. */ + The greatest observed error in special case region is 2.66 + 0.5 ULP: + _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023 + want 0x1.f9b2d3d22399bp+1023 + + The greatest observed error in the non-special region is 1.01 + 0.5 ULP: + _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3 + want 0x1.890b225657f82p+3. 
*/ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); @@ -92,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound); /* Up to the point that exp overflows, we can use it to calculate cosh by - exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ - svfloat64_t t = exp_inline (ax, pg, d); - svfloat64_t half_t = svmul_x (pg, t, 0.5); - svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + exp(|x|)/2 + 1 / (2 * exp(|x|)). */ + svfloat64_t half_exp = exp_over_two_inline (pg, ax, d); - /* Fall back to scalar for any special cases. */ + /* Falls back to entirely standalone vectorized special case. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svadd_x (pg, half_t, half_over_t), special); + return special_case (pg, special, ax, half_exp, d); - return svadd_x (pg, half_t, half_over_t); + svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25); + return svadd_x (pg, half_exp, inv_twoexp); } diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c index fb8e06cf7..805605541 100644 --- a/sysdeps/aarch64/fpu/coshf_sve.c +++ b/sysdeps/aarch64/fpu/coshf_sve.c @@ -39,9 +39,9 @@ special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e, } /* Single-precision vector cosh, using vector expf. - Maximum error is 2.77 ULP: - _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2 - want 0x1.e4594cp+2. */ + Maximum error is 2.56 +0.5 ULP: + _ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2 + want 0x1.e4774ep+2.
*/ svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c index 2743f9dbb..b57ab514b 100644 --- a/sysdeps/aarch64/fpu/erfcf_sve.c +++ b/sysdeps/aarch64/fpu/erfcf_sve.c @@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx); /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ - i = svmul_x (pg, i, 2); + i = svlsl_x (svptrue_b32 (), i, 1); const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr; svfloat32_t erfcr = svld1_gather_index (pg, p, i); svfloat32_t scale = svld1_gather_index (pg, p + 1, i); @@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ svfloat32_t r = svsub_x (pg, z, shift); svfloat32_t d = svsub_x (pg, a, r); - svfloat32_t d2 = svmul_x (pg, d, d); - svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third); - svfloat32_t third = svdup_lane (coeffs, 0); svfloat32_t p1 = r; - svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1); - svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); + svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1); + svfloat32_t p3 + = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2); p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4); diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c index f71bafdf0..53b28934d 100644 --- a/sysdeps/aarch64/fpu/exp10_sve.c +++ b/sysdeps/aarch64/fpu/exp10_sve.c @@ -18,21 +18,23 @@ . */ #include "sv_math.h" -#include "poly_sve_f64.h" #define SpecialBound 307.0 /* floor (log10 (2^1023)). 
*/ static const struct data { - double poly[5]; + double c1, c3, c2, c4, c0; double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound; } data = { /* Coefficients generated using Remez algorithm. rel error: 0x1.9fcb9b3p-60 abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ] max ulp err 0.52 +0.5. */ - .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1, - 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 }, + .c0 = 0x1.26bb1bbb55516p1, + .c1 = 0x1.53524c73cd32ap1, + .c2 = 0x1.0470591daeafbp1, + .c3 = 0x1.2bd77b1361ef6p0, + .c4 = 0x1.142b5d54e9621p-1, /* 1.5*2^46+1023. This value is further explained below. */ .shift = 0x1.800000000ffc0p+46, .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */ @@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, /* |n| > 1280 => 2^(n) overflows. */ svbool_t p_cmp = svacgt (pg, n, d->scale_thres); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); return svsel (p_cmp, r1, r0); } @@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg) comes at significant performance cost. */ svuint64_t u = svreinterpret_u64 (z); svfloat64_t scale = svexpa (u); - + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); /* Approximate exp10(r) using polynomial. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2, - sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1)); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + svfloat64_t p14 = svmla_x (pg, p12, p34, r2); + + svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14); /* Assemble result as exp10(x) = 2^n * exp10(r). 
If |x| > SpecialBound multiplication may overflow, so use special case routine. */ diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c index 1a74db265..f3e7f8b4f 100644 --- a/sysdeps/aarch64/fpu/exp10f_sve.c +++ b/sysdeps/aarch64/fpu/exp10f_sve.c @@ -19,26 +19,19 @@ #include "sv_math.h" -/* For x < -Thres, the result is subnormal and not handled correctly by - FEXPA. */ -#define Thres 37.9 +/* For x < -Thres (-log10(2^126)), the result is subnormal and not handled + correctly by FEXPA. */ +#define Thres 0x1.2f702p+5 static const struct data { - float log2_10_lo, c0, c2, c4; - float c1, c3, log10_2; - float shift, log2_10_hi, thres; + float log10_2, log2_10_hi, log2_10_lo, c1; + float c0, shift, thres; } data = { /* Coefficients generated using Remez algorithm with minimisation of relative - error. - rel error: 0x1.89dafa3p-24 - abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] - maxerr: 0.52 +0.5 ulp. */ - .c0 = 0x1.26bb16p+1f, - .c1 = 0x1.5350d2p+1f, - .c2 = 0x1.04744ap+1f, - .c3 = 0x1.2d8176p+0f, - .c4 = 0x1.12b41ap-1f, + error. */ + .c0 = 0x1.26bb62p1, + .c1 = 0x1.53524cp1, /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ .shift = 0x1.803f8p17f, .log10_2 = 0x1.a934fp+1, @@ -53,28 +46,23 @@ sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), with poly(r) in [1/sqrt(2), sqrt(2)] and x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ - - svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log10_2); /* n = round(x/(log10(2)/N)). */ svfloat32_t shift = sv_f32 (d->shift); - svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); - svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); + svfloat32_t z = svmla_lane (shift, x, lane_consts, 0); + svfloat32_t n = svsub_x (pg, z, shift); /* r = x - n*log10(2)/N. 
*/ - svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); - r = svmls_lane (r, n, lane_consts, 0); + svfloat32_t r = x; + r = svmls_lane (r, n, lane_consts, 1); + r = svmls_lane (r, n, lane_consts, 2); svfloat32_t scale = svexpa (svreinterpret_u32 (z)); /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ - svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); - svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); - svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); - svfloat32_t p14 = svmla_x (pg, p12, p34, r2); - svfloat32_t p0 = svmul_lane (r, lane_consts, 1); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); - + svfloat32_t poly = svmla_lane (sv_f32 (d->c0), r, lane_consts, 3); + poly = svmul_x (pg, poly, r); return svmla_x (pg, scale, scale, poly); } @@ -85,11 +73,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d) special); } -/* Single-precision SVE exp10f routine. Implements the same algorithm - as AdvSIMD exp10f. - Worst case error is 1.02 ULPs. - _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 - want 0x1.ba5f9cp-1. */ +/* Single-precision SVE exp10f routine. Based on the FEXPA instruction. + Worst case error is 1.10 ULP. + _ZGVsMxv_exp10f (0x1.cc76dep+3) got 0x1.be0172p+47 + want 0x1.be017p+47. */ svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c index a37c33092..c13585253 100644 --- a/sysdeps/aarch64/fpu/exp2_sve.c +++ b/sysdeps/aarch64/fpu/exp2_sve.c @@ -18,25 +18,22 @@ . */ #include "sv_math.h" -#include "poly_sve_f64.h" - -#define N (1 << V_EXP_TABLE_BITS) #define BigBound 1022 #define UOFlowBound 1280 static const struct data { - double poly[4]; + double c2, c4; + double c0, c1, c3; double shift, big_bound, uoflow_bound; } data = { /* Coefficients are computed using Remez algorithm with minimisation of the absolute error. 
*/ - .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5, - 0x1.3b2abf5571ad8p-7 }, - .shift = 0x1.8p52 / N, - .uoflow_bound = UOFlowBound, - .big_bound = BigBound, + .c0 = 0x1.62e42fefa39efp-1, .c1 = 0x1.ebfbdff82a31bp-3, + .c2 = 0x1.c6b08d706c8a5p-5, .c3 = 0x1.3b2ad2ff7d2f3p-7, + .c4 = 0x1.5d8761184beb3p-10, .shift = 0x1.800000000ffc0p+46, + .uoflow_bound = UOFlowBound, .big_bound = BigBound, }; #define SpecialOffset 0x6000000000000000 /* 0x1p513. */ @@ -65,47 +62,52 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); /* |n| > 1280 => 2^(n) overflows. */ - svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); + svbool_t p_cmp = svacle (pg, n, d->uoflow_bound); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); - return svsel (p_cmp, r1, r0); + return svsel (p_cmp, r0, r1); } /* Fast vector implementation of exp2. - Maximum measured error is 1.65 ulp. - _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 - want 0x1.f8db0d4df721dp-1. */ + Maximum measured error is 0.52 + 0.5 ulp. + _ZGVsMxv_exp2 (0x1.3b72ad5b701bfp-1) got 0x1.8861641b49e08p+0 + want 0x1.8861641b49e07p+0. */ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); - svbool_t no_big_scale = svacle (pg, x, d->big_bound); - svbool_t special = svnot_z (pg, no_big_scale); - - /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */ - svfloat64_t shift = sv_f64 (d->shift); - svfloat64_t kd = svadd_x (pg, x, shift); - svuint64_t ki = svreinterpret_u64 (kd); - /* kd = k/N. */ - kd = svsub_x (pg, kd, shift); - svfloat64_t r = svsub_x (pg, x, kd); - - /* scale ~= 2^(k/N). 
*/ - svuint64_t idx = svand_x (pg, ki, N - 1); - svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx); - /* This is only a valid scale when -1023*N < k < 1024*N. */ - svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); - svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); + svbool_t special = svacge (pg, x, d->big_bound); + + svfloat64_t z = svadd_x (svptrue_b64 (), x, d->shift); + svfloat64_t n = svsub_x (svptrue_b64 (), z, d->shift); + svfloat64_t r = svsub_x (svptrue_b64 (), x, n); + + svfloat64_t scale = svexpa (svreinterpret_u64 (z)); + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); /* Approximate exp2(r) using polynomial. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly); - svfloat64_t y = svmul_x (pg, r, p); + /* y = exp2(r) - 1 ~= r * (C0 + C1 r + C2 r^2 + C3 r^3 + C4 r^4). */ + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + svfloat64_t p = svmla_x (pg, p12, p34, r2); + p = svmad_x (pg, p, r, d->c0); + svfloat64_t y = svmul_x (svptrue_b64 (), r, p); /* Assemble exp2(x) = exp2(r) * scale. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (pg, scale, y, kd, d); + { + /* FEXPA zeroes the sign bit, however the sign is meaningful to the + special case function so needs to be copied. + e = sign bit of u << 46. */ + svuint64_t e = svand_x (pg, svlsl_x (pg, svreinterpret_u64 (z), 46), + 0x8000000000000000); + scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); + return special_case (pg, scale, y, n, d); + } + return svmla_x (pg, scale, scale, y); } diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c index fcd783016..989cefb60 100644 --- a/sysdeps/aarch64/fpu/exp2f_sve.c +++ b/sysdeps/aarch64/fpu/exp2f_sve.c @@ -18,21 +18,17 @@ . 
*/ #include "sv_math.h" -#include "poly_sve_f32.h" #define Thres 0x1.5d5e2ap+6f static const struct data { - float c0, c2, c4, c1, c3; - float shift, thres; + float c0, c1, shift, thres; } data = { - /* Coefficients copied from the polynomial in AdvSIMD variant. */ - .c0 = 0x1.62e422p-1f, - .c1 = 0x1.ebf9bcp-3f, - .c2 = 0x1.c6bd32p-5f, - .c3 = 0x1.3ce9e4p-7f, - .c4 = 0x1.59977ap-10f, + /* Coefficients generated using Remez algorithm with minimisation of relative + error. */ + .c0 = 0x1.62e485p-1, + .c1 = 0x1.ebfbe0p-3, /* 1.5*2^17 + 127. */ .shift = 0x1.803f8p17f, /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled @@ -51,16 +47,8 @@ sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - /* Polynomial evaluation: poly(r) ~ exp2(r)-1. - Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for - coefficients 1 to 4, and apply most significant coefficient directly. */ - svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0); - svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); - svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1); - svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2); - svfloat32_t p14 = svmla_x (pg, p12, r2, p34); - svfloat32_t p0 = svmul_lane (r, even_coeffs, 0); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); + svfloat32_t poly = svmla_x (pg, sv_f32 (d->c0), r, sv_f32 (d->c1)); + poly = svmul_x (svptrue_b32 (), poly, r); return svmla_x (pg, scale, scale, poly); } @@ -72,11 +60,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d) special); } -/* Single-precision SVE exp2f routine. Implements the same algorithm - as AdvSIMD exp2f. - Worst case error is 1.04 ULPs. - _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1 - want 0x1.ba6a64p-1. */ +/* Single-precision SVE exp2f routine, based on the FEXPA instruction. + Worst case error is 1.09 ULPs. 
+ _ZGVsMxv_exp2f (0x1.9a2a94p-1) got 0x1.be1054p+0 + want 0x1.be1052p+0. */ svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c index 37de751f9..dc049482e 100644 --- a/sysdeps/aarch64/fpu/exp_sve.c +++ b/sysdeps/aarch64/fpu/exp_sve.c @@ -21,12 +21,15 @@ static const struct data { - double poly[4]; + double c0, c2; + double c1, c3; double ln2_hi, ln2_lo, inv_ln2, shift, thres; + } data = { - .poly = { /* ulp error: 0.53. */ - 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, - 0x1.1111266d28935p-7 }, + .c0 = 0x1.fffffffffdbcdp-2, + .c1 = 0x1.555555555444cp-3, + .c2 = 0x1.555573c6a9f7dp-5, + .c3 = 0x1.1111266d28935p-7, .ln2_hi = 0x1.62e42fefa3800p-1, .ln2_lo = 0x1.ef35793c76730p-45, /* 1/ln2. */ @@ -36,7 +39,6 @@ static const struct data .thres = 704.0, }; -#define C(i) sv_f64 (d->poly[i]) #define SpecialOffset 0x6000000000000000 /* 0x1p513. */ /* SpecialBias1 + SpecialBias1 = asuint(1.0). */ #define SpecialBias1 0x7000000000000000 /* 0x1p769. */ @@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n) svuint64_t b = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ - /* Set s1 to generate overflow depending on sign of exponent n. */ - svfloat64_t s1 = svreinterpret_f64 ( - svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */ - /* Offset s to avoid overflow in final result if n is below threshold. */ + /* Set s1 to generate overflow depending on sign of exponent n, + ie. s1 = 0x70...0 - b. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. + ie. s2 = as_u64 (s) - 0x3010...0 + b. */ svfloat64_t s2 = svreinterpret_f64 ( - svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), - b)); /* as_u64 (s) - 0x3010...0 + b. 
*/ + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); /* |n| > 1280 => 2^(n) overflows. */ svbool_t p_cmp = svacgt (pg, n, 1280.0); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); return svsel (p_cmp, r1, r0); } @@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); svuint64_t u = svreinterpret_u64 (z); svfloat64_t n = svsub_x (pg, z, d->shift); - + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); svfloat64_t r = svmls_lane (x, n, ln2, 0); r = svmls_lane (r, n, ln2, 1); /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t p01 = svmla_x (pg, C (0), C (1), r); - svfloat64_t p23 = svmla_x (pg, C (2), C (3), r); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); svfloat64_t p04 = svmla_x (pg, p01, p23, r2); svfloat64_t y = svmla_x (pg, r, p04, r2); diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c index f9249db8b..c3619975b 100644 --- a/sysdeps/aarch64/fpu/expf_sve.c +++ b/sysdeps/aarch64/fpu/expf_sve.c @@ -40,9 +40,9 @@ special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d) } /* Optimised single-precision SVE exp function. - Worst-case error is 1.04 ulp: - SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4 - want 0x1.ba74bap+4. */ + Worst-case error is 0.88 +0.50 ULP: + _ZGVsMxv_expf(-0x1.bba276p-6) got 0x1.f25288p-1 + want 0x1.f2528ap-1. 
*/ svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c index d4ba8ccf3..b1d940bd2 100644 --- a/sysdeps/aarch64/fpu/expm1_sve.c +++ b/sysdeps/aarch64/fpu/expm1_sve.c @@ -18,82 +18,164 @@ . */ #include "sv_math.h" -#include "poly_sve_f64.h" -#define SpecialBound 0x1.62b7d369a5aa9p+9 -#define ExponentBias 0x3ff0000000000000 +#define FexpaBound 0x1.4cb5ecef28adap-3 /* 15*ln2/64. */ +#define SpecialBound 0x1.628c2855bfaddp+9 /* ln(2^(1023 + 1/128)). */ static const struct data { - double poly[11]; - double shift, inv_ln2, special_bound; - /* To be loaded in one quad-word. */ + double c2, c4; + double inv_ln2; double ln2_hi, ln2_lo; + double c0, c1, c3; + double shift, thres; + uint64_t expm1_data[32]; } data = { - /* Generated using fpminimax. */ - .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, - 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13, - 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, - 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, - - .special_bound = SpecialBound, - .inv_ln2 = 0x1.71547652b82fep0, - .ln2_hi = 0x1.62e42fefa39efp-1, - .ln2_lo = 0x1.abc9e3b39803fp-56, - .shift = 0x1.8p52, + /* Table emulating FEXPA - 1, for values of FEXPA close to 1. + The table holds values of 2^(i/64) - 1, computed in arbitrary precision. + The first half of the table stores values associated to i from 0 to 15. + The second half of the table stores values associated to i from 0 to -15. 
*/ + .expm1_data = { + 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, + 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, + 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, + 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, + 0x0000000000000000, 0xbfc331751ec3a814, 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, + 0xbfbf332113d56b1f, 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424, + 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, 0xbfaafd11874c009e, + 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, 0xbf95f134923757f3, 0xbf860f9f985bc9f4, + }, + + /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ + .c0 = 0x1p-1, + .c1 = 0x1.55555555548f9p-3, + .c2 = 0x1.5555555554c22p-5, + .c3 = 0x1.111123aaa2fb2p-7, + .c4 = 0x1.6c16d77d98e5bp-10, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .inv_ln2 = 0x1.71547652b82fep+0, + .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ + .thres = SpecialBound, }; -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t pg) +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +static NOINLINE svfloat64_t +special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p, + svfloat64_t n) { - return sv_call_f64 (expm1, x, y, pg); + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x6, 0 otherwise. */ + svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ + svuint64_t b + = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ + + /* Set s1 to generate overflow depending on sign of exponent n, + ie. s1 = 0x70...0 - b. 
*/ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. + ie. s2 = as_u64 (s) - 0x3010...0 + b. */ + svfloat64_t s2 = svreinterpret_f64 ( + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, 1280.0); + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, p); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */ + return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0)); } -/* Double-precision vector exp(x) - 1 function. - The maximum error observed error is 2.18 ULP: - _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 - want 0x1.a8b9ea8d66e2p-2. */ +/* FEXPA based SVE expm1 algorithm. + Maximum measured error is 2.81 + 0.5 ULP: + _ZGVsMxv_expm1 (0x1.974060e619bfp-3) got 0x1.c290e5858bb53p-3 + want 0x1.c290e5858bb5p-3. */ svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); - /* Large, Nan/Inf. */ - svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound)); - - /* Reduce argument to smaller range: - Let i = round(x / ln2) - and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where 2^i is exact because i is an integer. */ - svfloat64_t shift = sv_f64 (d->shift); - svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift); - svint64_t i = svcvt_s64_x (pg, n); - svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); - svfloat64_t f = svmls_lane (x, n, ln2, 0); - f = svmls_lane (f, n, ln2, 1); - - /* Approximate expm1(f) using polynomial. - Taylor expansion for expm1(x) has the form: - x + ax^2 + bx^3 + cx^4 .... - So we calculate the polynomial P(f) = a + bf + cf^2 + ... - and assemble the approximation expm1(f) ~= f + f^2 * P(f). 
*/ - svfloat64_t f2 = svmul_x (pg, f, f); - svfloat64_t f4 = svmul_x (pg, f2, f2); - svfloat64_t f8 = svmul_x (pg, f4, f4); - svfloat64_t p - = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); - - /* Assemble the result. - expm1(x) ~= 2^i * (p + 1) - 1 - Let t = 2^i. */ - svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias); - svfloat64_t t = svreinterpret_f64 (u); - - /* expm1(x) ~= p * t + (t - 1). */ - svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t); + svbool_t special = svacgt (pg, x, d->thres); - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, y, special); + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + /* r = x - n * ln2, r is in [-ln2/128, ln2/128]. */ + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t r = x; + r = svmls_lane (r, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + + svfloat64_t p; + svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + p = svmad_x (pg, c34, r2, c12); + p = svmad_x (pg, p, r, sv_f64 (d->c0)); + p = svmad_x (pg, p, r2, r); + + svfloat64_t scale = svexpa (u); + svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0)); + + /* We want to construct expm1(x) = (scale - 1) + scale * poly. + However, for values of scale close to 1, scale-1 causes large ULP errors + due to cancellation. + + This can be circumvented by using a small lookup for scale-1 + when our input is below a certain bound, otherwise we can use FEXPA. + + This bound is based upon the table size: + Bound = ((TableSize - 1) / 64) * ln2. + The current bound is based upon a table size of 16.
*/ + svbool_t is_small = svaclt (pg, x, FexpaBound); + + if (svptest_any (pg, is_small)) + { + /* Index via the input of FEXPA, but we only care about the lower 4 bits. + */ + svuint64_t base_idx = svand_x (pg, u, 0xf); + + /* We can use the sign of x as a fifth bit to account for the asymmetry + of e^x around 0. */ + svuint64_t signBit + = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4); + svuint64_t idx = svorr_x (pg, base_idx, signBit); + + /* Lookup values for scale - 1 for small x. */ + svfloat64_t lookup = svreinterpret_f64 ( + svld1_gather_index (is_small, d->expm1_data, idx)); + + /* Select the appropriate scale - 1 value based on x. */ + scalem1 = svsel (is_small, lookup, scalem1); + } + + svfloat64_t y = svmla_x (pg, scalem1, scale, p); + + /* FEXPA returns nan for large inputs so we special case those. */ + if (__glibc_unlikely (svptest_any (pg, special))) + { + /* FEXPA zeroes the sign bit, however the sign is meaningful to the + special case function so needs to be copied. + e = sign bit of u << 46. */ + svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000); + /* Copy sign to s. */ + scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); + return special_case (pg, y, scale, p, n); + } + + /* return expm1 = (scale - 1) + (scale * poly). */ return y; } diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c index 862c13f81..821c0780a 100644 --- a/sysdeps/aarch64/fpu/log1p_sve.c +++ b/sysdeps/aarch64/fpu/log1p_sve.c @@ -22,19 +22,33 @@ static const struct data { - double poly[19]; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; double ln2_hi, ln2_lo; uint64_t hfrt2_top, onemhfrt2_top, inf, mone; } data = { /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20 - polynomial, however first 2 coefficients are 0 and 1 so are not stored. 
*/ - .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, - 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, - -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, - 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, - -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, - 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, - -0x1.cfa7385bdb37ep-6, }, + polynomial, however first 2 coefficients are 0 and 1 so are not + stored. */ + .c0 = -0x1.ffffffffffffbp-2, + .c1 = 0x1.55555555551a9p-2, + .c2 = -0x1.00000000008e3p-2, + .c3 = 0x1.9999999a32797p-3, + .c4 = -0x1.555555552fecfp-3, + .c5 = 0x1.249248e071e5ap-3, + .c6 = -0x1.ffffff8bf8482p-4, + .c7 = 0x1.c71c8f07da57ap-4, + .c8 = -0x1.9999ca4ccb617p-4, + .c9 = 0x1.7459ad2e1dfa3p-4, + .c10 = -0x1.554d2680a3ff2p-4, + .c11 = 0x1.3b4c54d487455p-4, + .c12 = -0x1.2548a9ffe80e6p-4, + .c13 = 0x1.0f389a24b2e07p-4, + .c14 = -0x1.eee4db15db335p-5, + .c15 = 0x1.e95b494d4a5ddp-5, + .c16 = -0x1.15fdf07cb7c73p-4, + .c17 = 0x1.0310b70800fcfp-4, + .c18 = -0x1.cfa7385bdb37ep-6, .ln2_hi = 0x1.62e42fefa3800p-1, .ln2_lo = 0x1.ef35793c76730p-45, /* top32(asuint64(sqrt(2)/2)) << 32. */ @@ -49,7 +63,7 @@ static const struct data #define BottomMask 0xffffffff static svfloat64_t NOINLINE -special_case (svbool_t special, svfloat64_t x, svfloat64_t y) +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) { return sv_call_f64 (log1p, x, y, special); } @@ -91,8 +105,9 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. 
*/ svuint64_t utop = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top); - svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask)); - svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + svuint64_t u_red + = svorr_x (pg, utop, svand_x (svptrue_b64 (), mi, BottomMask)); + svfloat64_t f = svsub_x (svptrue_b64 (), svreinterpret_f64 (u_red), 1); /* Correction term c/m. */ svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m); @@ -103,18 +118,49 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) Hence approximation has the form f + f^2 * P(f) where P(x) = C0 + C1*x + C2x^2 + ... Assembling this all correctly is dealt with at the final step. */ - svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2), - f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8); - svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly); + svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), + f4 = svmul_x (svptrue_b64 (), f2, f2), + f8 = svmul_x (svptrue_b64 (), f4, f4), + f16 = svmul_x (svptrue_b64 (), f8, f8); + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); + + /* Order-18 Estrin scheme. 
*/ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, f2, p23); + svfloat64_t p47 = svmla_x (pg, p45, f2, p67); + svfloat64_t p07 = svmla_x (pg, p03, f4, p47); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); + + svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); + svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); + svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); + + svfloat64_t p015 = svmla_x (pg, p07, f8, p815); + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); + svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); + svfloat64_t p = svmla_x (pg, p015, f16, p1618); svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo); svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi); - svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (special, x, y); - - return y; + return special_case ( + x, svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p), + special); + return svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p); } strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c index 42d551ca9..b8c1b39dc 100644 --- a/sysdeps/aarch64/fpu/pow_sve.c +++ b/sysdeps/aarch64/fpu/pow_sve.c @@ -44,19 +44,18 @@ /* Data is defined in v_pow_log_data.c. */ #define N_LOG (1 << V_POW_LOG_TABLE_BITS) -#define A __v_pow_log_data.poly #define Off 0x3fe6955500000000 /* Data is defined in v_pow_exp_data.c. 
*/ #define N_EXP (1 << V_POW_EXP_TABLE_BITS) #define SignBias (0x800 << V_POW_EXP_TABLE_BITS) -#define C __v_pow_exp_data.poly #define SmallExp 0x3c9 /* top12(0x1p-54). */ #define BigExp 0x408 /* top12(512.). */ #define ThresExp 0x03f /* BigExp - SmallExp. */ #define HugeExp 0x409 /* top12(1024.). */ /* Constants associated with pow. */ +#define SmallBoundX 0x1p-126 #define SmallPowX 0x001 /* top12(0x1p-126). */ #define BigPowX 0x7ff /* top12(INFINITY). */ #define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ @@ -64,6 +63,31 @@ #define BigPowY 0x43e /* top12(0x1.749p62). */ #define ThresPowY 0x080 /* BigPowY - SmallPowY. */ +static const struct data +{ + double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo; + double log_c1, log_c3, log_c5, off; + double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo; + double exp_c0, exp_c1; +} data = { + .log_c0 = -0x1p-1, + .log_c1 = -0x1.555555555556p-1, + .log_c2 = 0x1.0000000000006p-1, + .log_c3 = 0x1.999999959554ep-1, + .log_c4 = -0x1.555555529a47ap-1, + .log_c5 = -0x1.2495b9b4845e9p0, + .log_c6 = 0x1.0002b8b263fc3p0, + .off = Off, + .exp_c0 = 0x1.fffffffffffd4p-2, + .exp_c1 = 0x1.5555571d6ef9p-3, + .exp_c2 = 0x1.5555576a5adcep-5, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP, + .ln2_over_n_hi = 0x1.62e42fefc0000p-9, + .ln2_over_n_lo = -0x1.c610ca86c3899p-45, +}; + /* Check if x is an integer. 
*/ static inline svbool_t sv_isint (svbool_t pg, svfloat64_t x) @@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x) static inline svbool_t sv_isodd (svbool_t pg, svfloat64_t x) { - svfloat64_t y = svmul_x (pg, x, 0.5); + svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5); return sv_isnotint (pg, y); } @@ -121,7 +145,7 @@ zeroinfnan (uint64_t i) static inline svbool_t sv_zeroinfnan (svbool_t pg, svuint64_t i) { - return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1), + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), 2 * asuint64 (INFINITY) - 1); } @@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2, additional 15 bits precision. IX is the bit representation of x, but normalized in the subnormal range using the sign bit for the exponent. */ static inline svfloat64_t -sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) +sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail, + const struct data *d) { /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t tmp = svsub_x (pg, ix, d->off); svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS), sv_u64 (N_LOG - 1)); svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52))); + svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52)); svfloat64_t z = svreinterpret_f64 (iz); svfloat64_t kd = svcvt_f64_x (pg, k); @@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ svfloat64_t r = svmad_x (pg, z, invc, -1.0); /* k*Ln2 + log(c) + r. 
*/ - svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi); + + svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi); + svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0); svfloat64_t t2 = svadd_x (pg, t1, r); - svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo); + svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1); svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r); /* Evaluation is optimized assuming superscalar pipelined execution. */ - svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */ - svfloat64_t ar2 = svmul_x (pg, r, ar); - svfloat64_t ar3 = svmul_x (pg, r, ar2); + + svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0); + svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0); + svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar); + svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2); /* k*Ln2 + log(c) + r + A[0]*r*r. */ svfloat64_t hi = svadd_x (pg, t2, ar2); - svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r); + svfloat64_t lo3 = svmls_x (pg, ar2, ar, r); svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2); /* p = log1p(r) - r - A[0]*r*r. */ /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * A[6])))). 
*/ - svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]); - svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]); - svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]); + + svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4); + svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1); + svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0); + svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1); svfloat64_t p = svmla_x (pg, a34, ar2, a56); p = svmla_x (pg, a12, ar2, p); - p = svmul_x (pg, ar3, p); + p = svmul_x (svptrue_b64 (), ar3, p); svfloat64_t lo = svadd_x ( - pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); + pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); svfloat64_t y = svadd_x (pg, hi, lo); *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); return y; } +static inline svfloat64_t +sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits, + svuint64_t *ki, const struct data *d) +{ + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2); + svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t kd = svrinta_x (pg, z); + *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd)); + + svfloat64_t ln2_over_n_hilo + = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi); + svfloat64_t r = x; + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0); + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = svadd_x (pg, r, xtail); + /* 2^(k/N) ~= scale. 
*/ + svuint64_t idx = svand_x (pg, *ki, N_EXP - 1); + svuint64_t top + = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); + *sbits = svadd_x (pg, *sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1); + *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp); + *tmp = svmla_x (pg, r, r2, *tmp); + svfloat64_t scale = svreinterpret_f64 (*sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + z = svmla_x (pg, scale, scale, *tmp); + return z; +} + /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ static inline svfloat64_t sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, - svuint64_t sign_bias) + svuint64_t sign_bias, const struct data *d) { /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) and other cases of large values of x (scale * (1 + TMP) oflow). */ @@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */ svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); - /* Conditions special, uflow and oflow are all expressed as uoflow && - something, hence do not bother computing anything if no lane in uoflow is - true. */ - svbool_t special = svpfalse_b (); - svbool_t uflow = svpfalse_b (); - svbool_t oflow = svpfalse_b (); + svfloat64_t tmp; + svuint64_t sbits, ki; if (__glibc_unlikely (svptest_any (pg, uoflow))) { + svfloat64_t z + = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); + /* |x| is tiny (|x| <= 0x1p-54). 
*/ - uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); + svbool_t uflow + = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); uflow = svand_z (pg, uoflow, uflow); /* |x| is huge (|x| >= 1024). */ - oflow = svcmpge (pg, abstop, HugeExp); + svbool_t oflow = svcmpge (pg, abstop, HugeExp); oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow - or underflow. */ - special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + or underflow. */ + svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + + /* Update result with special and large cases. */ + z = sv_call_specialcase (tmp, sbits, ki, z, special); + + /* Handle underflow and overflow. */ + svbool_t x_is_neg = svcmplt (pg, x, 0); + svuint64_t sign_mask + = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); + svfloat64_t res_uoflow + = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); + res_uoflow = svreinterpret_f64 ( + svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); + /* Avoid spurious underflow for tiny x. */ + svfloat64_t res_spurious_uflow + = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); + + z = svsel (oflow, res_uoflow, z); + z = svsel (uflow, res_spurious_uflow, z); + return z; } - /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ - /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ - svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2); - /* z - kd is in [-1, 1] in non-nearest rounding modes. */ - svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift); - svfloat64_t kd = svadd_x (pg, z, shift); - svuint64_t ki = svreinterpret_u64 (kd); - kd = svsub_x (pg, kd, shift); - svfloat64_t r = x; - r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi); - r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo); - /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ - r = svadd_x (pg, r, xtail); - /* 2^(k/N) ~= scale. 
*/ - svuint64_t idx = svand_x (pg, ki, N_EXP - 1); - svuint64_t top - = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); - /* This is only a valid scale when -1023*N < k < 1024*N. */ - svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); - sbits = svadd_x (pg, sbits, top); - /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]); - tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp); - tmp = svmla_x (pg, r, r2, tmp); - svfloat64_t scale = svreinterpret_f64 (sbits); - /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there - is no spurious underflow here even without fma. */ - z = svmla_x (pg, scale, scale, tmp); - - /* Update result with special and large cases. */ - if (__glibc_unlikely (svptest_any (pg, special))) - z = sv_call_specialcase (tmp, sbits, ki, z, special); - - /* Handle underflow and overflow. */ - svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63); - svbool_t x_is_neg = svcmpne (pg, sign_bit, 0); - svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); - svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); - res_uoflow = svreinterpret_f64 ( - svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); - z = svsel (oflow, res_uoflow, z); - /* Avoid spurious underflow for tiny x. */ - svfloat64_t res_spurious_uflow - = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); - z = svsel (uflow, res_spurious_uflow, z); - - return z; + return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); } static inline double @@ -341,47 +384,39 @@ pow_sc (double x, double y) svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* This preamble handles special case conditions used in the final scalar fallbacks. 
It also updates ix and sign_bias, that are used in the core computation too, i.e., exp( y * log (x) ). */ svuint64_t vix0 = svreinterpret_u64 (x); svuint64_t viy0 = svreinterpret_u64 (y); - svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52); /* Negative x cases. */ - svuint64_t sign_bit = svlsr_m (pg, vix0, 63); - svbool_t xisneg = svcmpeq (pg, sign_bit, 1); + svbool_t xisneg = svcmplt (pg, x, 0); /* Set sign_bias and ix depending on sign of x and nature of y. */ - svbool_t yisnotint_xisneg = svpfalse_b (); + svbool_t yint_or_xpos = pg; svuint64_t sign_bias = sv_u64 (0); svuint64_t vix = vix0; - svuint64_t vtopx1 = vtopx0; if (__glibc_unlikely (svptest_any (pg, xisneg))) { /* Determine nature of y. */ - yisnotint_xisneg = sv_isnotint (xisneg, y); - svbool_t yisint_xisneg = sv_isint (xisneg, y); + yint_or_xpos = sv_isint (xisneg, y); svbool_t yisodd_xisneg = sv_isodd (xisneg, y); /* ix set to abs(ix) if y is integer. */ - vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff); - vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff); + vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff); /* Set to SignBias if x is negative and y is odd. */ sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); } - /* Special cases of x or y: zero, inf and nan. */ - svbool_t xspecial = sv_zeroinfnan (pg, vix0); - svbool_t yspecial = sv_zeroinfnan (pg, viy0); - svbool_t special = svorr_z (pg, xspecial, yspecial); - /* Small cases of x: |x| < 0x1p-126. */ - svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff); - svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX); - if (__glibc_unlikely (svptest_any (pg, xsmall))) + svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX); + if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall))) { /* Normalize subnormal x so exponent becomes negative. 
*/ - svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0); + svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52); + svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0); svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); @@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) /* y_hi = log(ix, &y_lo). */ svfloat64_t vlo; - svfloat64_t vhi = sv_log_inline (pg, vix, &vlo); + svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d); /* z = exp(y_hi, y_lo, sign_bias). */ - svfloat64_t vehi = svmul_x (pg, y, vhi); - svfloat64_t velo = svmul_x (pg, y, vlo); - svfloat64_t vemi = svmls_x (pg, vehi, y, vhi); - velo = svsub_x (pg, velo, vemi); - svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias); + svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi); + svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi); + svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo); + svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d); /* Cases of finite y and finite negative x. */ - vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz); + vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan (""))); + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0); + svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0); + svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial); /* Cases of zero/inf/nan x or y. 
*/ - if (__glibc_unlikely (svptest_any (pg, special))) + if (__glibc_unlikely (svptest_any (svptrue_b64 (), special))) vz = sv_call2_f64 (pow_sc, x, y, vz, special); return vz; diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c index 29e9acb6f..7046990aa 100644 --- a/sysdeps/aarch64/fpu/powf_sve.c +++ b/sysdeps/aarch64/fpu/powf_sve.c @@ -26,7 +26,6 @@ #define Tlogc __v_powf_data.logc #define Texp __v_powf_data.scale #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11)) -#define Shift 0x1.8p52 #define Norm 0x1p23f /* 0x4b000000. */ /* Overall ULP error bound for pow is 2.6 ulp @@ -36,7 +35,7 @@ static const struct data double log_poly[4]; double exp_poly[3]; float uflow_bound, oflow_bound, small_bound; - uint32_t sign_bias, sign_mask, subnormal_bias, off; + uint32_t sign_bias, subnormal_bias, off; } data = { /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of V_POWF_EXP2_N. */ @@ -53,7 +52,6 @@ static const struct data .small_bound = 0x1p-126f, .off = 0x3f35d000, .sign_bias = SignBias, - .sign_mask = 0x80000000, .subnormal_bias = 0x0b800000, /* 23 << 23. */ }; @@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x) static inline svbool_t sv_zeroinfnan (svbool_t pg, svuint32_t i) { - return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1), + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), 2u * 0x7f800000 - 1); } @@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z) } /* Scalar fallback for special case routines with custom signature. */ -static inline svfloat32_t -sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y) { + /* Special cases of x or y: zero, inf and nan. 
*/ + svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1)); + svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2)); + svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial); + svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { @@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, /* Polynomial to approximate log1p(r)/ln2. */ svfloat64_t logx = A (0); - logx = svmla_x (pg, A (1), r, logx); - logx = svmla_x (pg, A (2), r, logx); - logx = svmla_x (pg, A (3), r, logx); - logx = svmla_x (pg, y0, r, logx); + logx = svmad_x (pg, r, logx, A (1)); + logx = svmad_x (pg, r, logx, A (2)); + logx = svmad_x (pg, r, logx, A (3)); + logx = svmad_x (pg, r, logx, y0); *pylogx = svmul_x (pg, y, logx); /* z - kd is in [-1, 1] in non-nearest rounding modes. */ - svfloat64_t kd = svadd_x (pg, *pylogx, Shift); - svuint64_t ki = svreinterpret_u64 (kd); - kd = svsub_x (pg, kd, Shift); + svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx); + svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd)); r = svsub_x (pg, *pylogx, kd); /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). 
*/ - svuint64_t t - = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1)); - svuint64_t ski = svadd_x (pg, ki, sign_bias); - t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS)); + svuint64_t t = svld1_gather_index ( + svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1)); + svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias); + t = svadd_x (svptrue_b64 (), t, + svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS)); svfloat64_t s = svreinterpret_f64 (t); svfloat64_t p = C (0); p = svmla_x (pg, C (1), p, r); p = svmla_x (pg, C (2), p, r); - p = svmla_x (pg, s, p, svmul_x (pg, s, r)); + p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r)); return p; } @@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, { const svbool_t ptrue = svptrue_b64 (); - /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in - order to perform core computation in double precision. */ + /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two + * in order to perform core computation in double precision. 
*/ const svbool_t pg_lo = svunpklo (pg); const svbool_t pg_hi = svunpkhi (pg); - svfloat64_t y_lo = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); - svfloat64_t y_hi = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); - svfloat32_t z = svreinterpret_f32 (iz); - svfloat64_t z_lo = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z)))); - svfloat64_t z_hi = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z)))); + svfloat64_t y_lo + = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); + svfloat64_t y_hi + = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); + svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz))); + svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz))); svuint64_t i_lo = svunpklo (i); svuint64_t i_hi = svunpkhi (i); svint64_t k_lo = svunpklo (k); @@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, /* Implementation of SVE powf. Provides the same accuracy as AdvSIMD powf, since it relies on the same algorithm. The theoretical maximum error is under 2.60 ULPs. - Maximum measured error is 2.56 ULPs: - SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127 - want 0x1.fd4b06p+127. */ + Maximum measured error is 2.57 ULPs: + SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127 + want 0x1.fff862p+127. */ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) { const struct data *d = ptr_barrier (&data); @@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) svuint32_t viy0 = svreinterpret_u32 (y); /* Negative x cases. 
*/ - svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask); - svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask); + svbool_t xisneg = svcmplt (pg, x, sv_f32 (0)); /* Set sign_bias and ix depending on sign of x and nature of y. */ - svbool_t yisnotint_xisneg = svpfalse_b (); + svbool_t yint_or_xpos = pg; svuint32_t sign_bias = sv_u32 (0); svuint32_t vix = vix0; if (__glibc_unlikely (svptest_any (pg, xisneg))) { /* Determine nature of y. */ - yisnotint_xisneg = svisnotint (xisneg, y); - svbool_t yisint_xisneg = svisint (xisneg, y); + yint_or_xpos = svisint (xisneg, y); svbool_t yisodd_xisneg = svisodd (xisneg, y); /* ix set to abs(ix) if y is integer. */ - vix = svand_m (yisint_xisneg, vix0, 0x7fffffff); + vix = svand_m (yint_or_xpos, vix0, 0x7fffffff); /* Set to SignBias if x is negative and y is odd. */ sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); } @@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) svbool_t cmp = svorr_z (pg, xspecial, yspecial); /* Small cases of x: |x| < 0x1p-126. */ - svbool_t xsmall = svaclt (pg, x, d->small_bound); - if (__glibc_unlikely (svptest_any (pg, xsmall))) + svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound); + if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall))) { /* Normalize subnormal x so exponent becomes negative. */ svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); @@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) vix = svsel (xsmall, vix_norm, vix); } /* Part of core computation carried in working precision. 
*/ - svuint32_t tmp = svsub_x (pg, vix, d->off); - svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), - V_POWF_LOG2_N - 1); - svuint32_t top = svand_x (pg, tmp, 0xff800000); - svuint32_t iz = svsub_x (pg, vix, top); - svint32_t k - = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS)); - - /* Compute core in extended precision and return intermediate ylogx results to - handle cases of underflow and underflow in exp. */ + svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off); + svuint32_t i = svand_x ( + yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + V_POWF_LOG2_N - 1); + svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000); + svuint32_t iz = svsub_x (yint_or_xpos, vix, top); + svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top), + (23 - V_POWF_EXP2_TABLE_BITS)); + + /* Compute core in extended precision and return intermediate ylogx results + * to handle cases of underflow and overflow in exp. */ svfloat32_t ylogx; - svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d); + svfloat32_t ret + = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d); /* Handle exp special cases of underflow and overflow. */ - svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); + svuint32_t sign + = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); svfloat32_t ret_oflow - = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY))); + = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY))); svfloat32_t ret_uflow = svreinterpret_f32 (sign); - ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret); - ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret); + ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret); + ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret); /* Cases of finite y and finite negative x.
*/ - ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret); + ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf (""))); - if (__glibc_unlikely (svptest_any (pg, cmp))) - return sv_call_powf_sc (x, y, ret, cmp); + if (__glibc_unlikely (svptest_any (cmp, cmp))) + return sv_call_powf_sc (x, y, ret); return ret; } diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c index 963453f81..072ba8fca 100644 --- a/sysdeps/aarch64/fpu/sinh_sve.c +++ b/sysdeps/aarch64/fpu/sinh_sve.c @@ -18,90 +18,153 @@ . */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[11]; - float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift; uint64_t halff; - int64_t onef; - uint64_t large_bound; + double c2, c4; + double inv_ln2; + double ln2_hi, ln2_lo; + double c0, c1, c3; + double shift, special_bound, bound; + uint64_t expm1_data[20]; } data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, - 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, - 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, - 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, - 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, - - .inv_ln2 = 0x1.71547652b82fep0, - .m_ln2_hi = -0x1.62e42fefa39efp-1, - .m_ln2_lo = -0x1.abc9e3b39803fp-56, - .shift = 0x1.8p52, - + /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */ + .expm1_data = { + 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, + 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, + 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, + 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, + 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7, + }, + + /* Generated using Remez, in [-log(2)/128, log(2)/128]. 
*/ + .c0 = 0x1p-1, + .c1 = 0x1.55555555548f9p-3, + .c2 = 0x1.5555555554c22p-5, + .c3 = 0x1.111123aaa2fb2p-7, + .c4 = 0x1.6c16d77d98e5bp-10, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .inv_ln2 = 0x1.71547652b82fep+0, + .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ .halff = 0x3fe0000000000000, - .onef = 0x3ff0000000000000, - /* 2^9. expm1 helper overflows for large input. */ - .large_bound = 0x4080000000000000, + .special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */ + .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */ }; +/* A specialised FEXPA expm1 that is only valid for positive inputs and + has no special cases. Based off the full FEXPA expm1 implemented for + _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */ static inline svfloat64_t -expm1_inline (svfloat64_t x, svbool_t pg) +expm1_inline (svbool_t pg, svfloat64_t x) { const struct data *d = ptr_barrier (&data); - /* Reduce argument: - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where i = round(x / ln2) - and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ - svfloat64_t j - = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); - svint64_t i = svcvt_s64_x (pg, j); - svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi); - f = svmla_x (pg, f, j, d->m_ln2_lo); - /* Approximate expm1(f) using polynomial. */ - svfloat64_t f2 = svmul_x (pg, f, f); - svfloat64_t f4 = svmul_x (pg, f2, f2); - svfloat64_t f8 = svmul_x (pg, f4, f4); - svfloat64_t p - = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); - /* t = 2^i. */ - svfloat64_t t = svscale_x (pg, sv_f64 (1), i); - /* expm1(x) ~= p * t + (t - 1).
*/ - return svmla_x (pg, svsub_x (pg, t, 1.0), p, t); + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + + svfloat64_t r = x; + r = svmls_lane (r, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + + svfloat64_t p; + svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + p = svmad_x (pg, c34, r2, c12); + p = svmad_x (pg, p, r, sv_f64 (d->c0)); + p = svmad_x (pg, p, r2, r); + + svfloat64_t scale = svexpa (u); + + /* We want to construct expm1(x) = (scale - 1) + scale * poly. + However, for values of scale close to 1, scale-1 causes large ULP errors + due to cancellation. + + This can be circumvented by using a small lookup for scale-1 + when our input is below a certain bound, otherwise we can use FEXPA. */ + svbool_t is_small = svaclt (pg, x, d->bound); + + /* Index via the input of FEXPA, but we only care about the lower 5 bits. */ + svuint64_t base_idx = svand_x (pg, u, 0x1f); + + /* Compute scale - 1 from FEXPA, and lookup values where this fails. */ + svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0)); + svuint64_t scalem1_lookup + = svld1_gather_index (is_small, d->expm1_data, base_idx); + + /* Select the appropriate scale - 1 value based on x. */ + svfloat64_t scalem1 + = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate); + + /* return expm1 = scale - 1 + (scale * poly). */ + return svmla_x (pg, scalem1, scale, p); } +/* Vectorised special case to handle values past where exp_inline overflows. + Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double + the valid range of inputs, and returns inf for anything past that. 
*/ static svfloat64_t NOINLINE -special_case (svfloat64_t x, svbool_t pg) +special_case (svbool_t pg, svbool_t special, svfloat64_t ax, + svfloat64_t halfsign, const struct data *d) { - return sv_call_f64 (sinh, x, x, pg); + /* Halves input value, and then check if any cases + are still going to overflow. */ + ax = svmul_x (special, ax, 0.5); + svbool_t is_safe = svaclt (special, ax, d->special_bound); + + svfloat64_t t = expm1_inline (pg, ax); + + /* Finish fast path to compute values for non-special cases. */ + svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); + y = svmul_x (pg, y, halfsign); + + /* Computes special lane, and set remaining overflow lanes to inf. */ + svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign); + svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t); + + svuint64_t signed_inf + = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign), + sv_u64 (0x7ff0000000000000)); + special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf)); + + /* Join resulting vectors together and return. */ + return svsel (special, special_y, y); } -/* Approximation for SVE double-precision sinh(x) using expm1. - sinh(x) = (exp(x) - exp(-x)) / 2. - The greatest observed error is 2.57 ULP: - _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2 - want 0x1.ab929fc64bd63p-2. */ +/* Approximation for SVE double-precision sinh(x) using FEXPA expm1. + Uses sinh(x) = (e^(2x) - 1) / (2e^x), rewritten for accuracy. + The greatest observed error in the non-special region is 2.63 + 0.5 ULP: + _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2 + want 0x1.c3587faf97b09p-2 + + The greatest observed error in the special region is 2.65 + 0.5 ULP: + _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023 + want 0x1.fffd30eea0063p+1023.
*/ svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t special = svacge (pg, x, d->special_bound); svfloat64_t ax = svabs_x (pg, x); svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff)); - svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound); - /* Fall back to scalar variant for all lanes if any are special. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, pg); + return special_case (pg, special, ax, halfsign, d); /* Up to the point that expm1 overflows, we can use it to calculate sinh using a slight rearrangement of the definition of sinh. This allows us to retain acceptable accuracy for very small inputs. */ - svfloat64_t t = expm1_inline (ax, pg); + svfloat64_t t = expm1_inline (pg, ax); t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); return svmul_x (pg, t, halfsign); } diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h index f208d3389..e2d2e906b 100644 --- a/sysdeps/aarch64/fpu/sv_expf_inline.h +++ b/sysdeps/aarch64/fpu/sv_expf_inline.h @@ -24,52 +24,41 @@ struct sv_expf_data { - float c1, c3, inv_ln2; - float ln2_lo, c0, c2, c4; - float ln2_hi, shift; + float ln2_hi, ln2_lo, c1, null; + float inv_ln2, shift; }; -/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for - compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ +/* Shift is 1.5*2^17 + 127. */ #define SV_EXPF_DATA \ { \ - /* Coefficients copied from the polynomial in AdvSIMD variant. 
*/ \ - .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \ - .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \ - .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ - .shift = 0x1.803f8p17f, \ + .c1 = 0.5f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ + .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ } -#define C(i) sv_f32 (d->poly[i]) - static inline svfloat32_t expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) { /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo); + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_hi); /* n = round(x/(ln2/N)). */ svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift); svfloat32_t n = svsub_x (pg, z, d->shift); /* r = x - n*ln2/N. */ - svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x); + svfloat32_t r = x; r = svmls_lane (r, n, lane_consts, 0); + r = svmls_lane (r, n, lane_consts, 1); /* scale = 2^(n/N). */ svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ - svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); - svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); + /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. 
*/ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); - svfloat32_t p14 = svmla_x (pg, p12, p34, r2); - svfloat32_t p0 = svmul_lane (r, lane_consts, 1); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); + svfloat32_t poly = svmla_lane (r, r2, lane_consts, 2); return svmla_x (pg, scale, scale, poly); } - #endif diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h index 71f88e02d..c2b196f35 100644 --- a/sysdeps/aarch64/fpu/sv_log1p_inline.h +++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h @@ -21,11 +21,12 @@ #define AARCH64_FPU_SV_LOG1P_INLINE_H #include "sv_math.h" -#include "poly_sve_f64.h" static const struct sv_log1p_data { - double poly[19], ln2[2]; + double c0, c2, c4, c6, c8, c10, c12, c14, c16; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; + double ln2_lo, ln2_hi; uint64_t hf_rt2_top; uint64_t one_m_hf_rt2_top; uint32_t bottom_mask; @@ -33,15 +34,30 @@ static const struct sv_log1p_data } sv_log1p_data = { /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ - .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, - 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, - -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, - 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, - -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, - 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, - -0x1.cfa7385bdb37ep-6 }, - .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, + .c0 = -0x1.ffffffffffffbp-2, + .c1 = 0x1.55555555551a9p-2, + .c2 = -0x1.00000000008e3p-2, + .c3 = 0x1.9999999a32797p-3, + .c4 = -0x1.555555552fecfp-3, + .c5 = 0x1.249248e071e5ap-3, + .c6 = -0x1.ffffff8bf8482p-4, + .c7 = 0x1.c71c8f07da57ap-4, + .c8 = -0x1.9999ca4ccb617p-4, + .c9 = 0x1.7459ad2e1dfa3p-4, + .c10 = -0x1.554d2680a3ff2p-4, + .c11 = 0x1.3b4c54d487455p-4, + .c12 = -0x1.2548a9ffe80e6p-4, + .c13 = 0x1.0f389a24b2e07p-4, + .c14 = -0x1.eee4db15db335p-5, + .c15 = 0x1.e95b494d4a5ddp-5, + .c16 = -0x1.15fdf07cb7c73p-4, + .c17 = 0x1.0310b70800fcfp-4, + .c18 = -0x1.cfa7385bdb37ep-6, + .ln2_lo = 0x1.62e42fefa3800p-1, + .ln2_hi = 0x1.ef35793c76730p-45, + /* top32(asuint64(sqrt(2)/2)) << 32. */ .hf_rt2_top = 0x3fe6a09e00000000, + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ .one_m_hf_rt2_top = 0x00095f6200000000, .bottom_mask = 0xffffffff, .one_top = 0x3ff @@ -51,14 +67,14 @@ static inline svfloat64_t sv_log1p_inline (svfloat64_t x, const svbool_t pg) { /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which - differs from v_log1p_2u5.c by: + differs from advsimd/log1p.c by: - No special-case handling - this should be dealt with by the caller. - Pairwise Horner polynomial evaluation for improved accuracy. - Optionally simulate the shortcut for k=0, used in the scalar routine, using svsel, for improved accuracy when the argument to log1p is close to 0. 
This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1 in the source of the caller before including this file. - See sv_log1p_2u1.c for details of the algorithm. */ + See sve/log1p.c for details of the algorithm. */ const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data); svfloat64_t m = svadd_x (pg, x, 1); svuint64_t mi = svreinterpret_u64 (m); @@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) svfloat64_t cm; #ifndef WANT_SV_LOG1P_K0_SHORTCUT -#error \ +#error \ "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" #elif WANT_SV_LOG1P_K0_SHORTCUT /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is @@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) #endif /* Approximate log1p(f) on the reduced input using a polynomial. */ - svfloat64_t f2 = svmul_x (pg, f, f); - svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly); + svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), + f4 = svmul_x (svptrue_b64 (), f2, f2), + f8 = svmul_x (svptrue_b64 (), f4, f4), + f16 = svmul_x (svptrue_b64 (), f8, f8); + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); + + /* Order-18 Estrin scheme. 
*/ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, f2, p23); + svfloat64_t p47 = svmla_x (pg, p45, f2, p67); + svfloat64_t p07 = svmla_x (pg, p03, f4, p47); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); + + svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); + svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); + svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); + + svfloat64_t p015 = svmla_x (pg, p07, f8, p815); + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); + svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); + svfloat64_t p = svmla_x (pg, p015, f16, p1618); /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ - svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]); - svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]); + svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo); + svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0); + svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1); - return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); + return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi)); } - #endif diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c index 789cc6854..586941901 100644 --- a/sysdeps/aarch64/fpu/tanh_sve.c +++ b/sysdeps/aarch64/fpu/tanh_sve.c @@ -18,83 +18,117 @@ . 
*/ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[11]; - float64_t inv_ln2, ln2_hi, ln2_lo, shift; - uint64_t thresh, tiny_bound; + double ln2_hi, ln2_lo; + double c2, c4; + double c0, c1, c3; + double two_over_ln2, shift; + uint64_t tiny_bound; + double large_bound, fexpa_bound; + uint64_t e2xm1_data[20]; } data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, - 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, - 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, - 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, - 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, - - .inv_ln2 = 0x1.71547652b82fep0, - .ln2_hi = -0x1.62e42fefa39efp-1, - .ln2_lo = -0x1.abc9e3b39803fp-56, - .shift = 0x1.8p52, - + /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ + .c0 = 0x1p-1, + .c1 = 0x1.55555555548f9p-3, + .c2 = 0x1.5555555554c22p-5, + .c3 = 0x1.111123aaa2fb2p-7, + .c4 = 0x1.6c16d77d98e5bp-10, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .two_over_ln2 = 0x1.71547652b82fep+1, + .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */ - /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ - .thresh = 0x01f241bf835f9d5f, + .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */ + .fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */ + /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. 
*/ + .e2xm1_data = { + 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, + 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, + 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, + 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, + 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7, + }, }; +/* An expm1 inspired, FEXPA based helper function that returns an + accurate estimate for e^2x - 1. With no special case or support for + negative inputs of x. */ static inline svfloat64_t -expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d) -{ - /* Helper routine for calculating exp(x) - 1. Vector port of the helper from - the scalar variant of tanh. */ - - /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - svfloat64_t j - = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); - svint64_t i = svcvt_s64_x (pg, j); - svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi); - f = svmla_x (pg, f, j, d->ln2_lo); - - /* Approximate expm1(f) using polynomial. */ - svfloat64_t f2 = svmul_x (pg, f, f); - svfloat64_t f4 = svmul_x (pg, f2, f2); - svfloat64_t p = svmla_x ( - pg, f, f2, - sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly)); - - /* t = 2 ^ i. */ - svfloat64_t t = svscale_x (pg, sv_f64 (1), i); - /* expm1(x) = p * t + (t - 1). */ - return svmla_x (pg, svsub_x (pg, t, 1), p, t); -} - -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d) { - return sv_call_f64 (tanh, x, y, special); + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + + /* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. 
*/ + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t r = svadd_x (pg, x, x); + r = svmls_lane (r, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + + svfloat64_t p; + svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + p = svmad_x (pg, c34, r2, c12); + p = svmad_x (pg, p, r, sv_f64 (d->c0)); + p = svmad_x (pg, p, r2, r); + + svfloat64_t scale = svexpa (u); + + /* We want to construct e2xm1(x) = (scale - 1) + scale * poly. + However, for values of scale close to 1, scale-1 causes large ULP errors + due to cancellation. + + This can be circumvented by using a small lookup for scale-1 + when our input is below a certain bound, otherwise we can use FEXPA. */ + svbool_t is_small = svaclt (pg, x, d->fexpa_bound); + + /* Index via the input of FEXPA, but we only care about the lower 5 bits. */ + svuint64_t base_idx = svand_x (pg, u, 0x1f); + + /* Compute scale - 1 from FEXPA, and lookup values where this fails. */ + svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0)); + svuint64_t scalem1_lookup + = svld1_gather_index (is_small, d->e2xm1_data, base_idx); + + /* Select the appropriate scale - 1 value based on x. */ + svfloat64_t scalem1 + = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate); + return svmla_x (pg, scalem1, scale, p); } -/* SVE approximation for double-precision tanh(x), using a simplified - version of expm1. The greatest observed error is 2.77 ULP: - _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 - want -0x1.bd6a21a163624p-3. */ +/* SVE approximation for double-precision tanh(x), using a modified version of + FEXPA expm1 to calculate e^2x - 1. 
+ The greatest observed error is 2.79 + 0.5 ULP: + _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9 + want 0x1.fff7be486cae9p-9. */ svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); - svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x)); + svbool_t large = svacge (pg, x, d->large_bound); - /* Trigger special-cases for tiny, boring and infinity/NaN. */ - svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh); + /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh. + As an additional optimisation, we can ensure more accurate values of e^x + by only using positive inputs. So we calculate tanh(|x|), and restore the + sign of the input before returning. */ + svfloat64_t ax = svabs_x (pg, x); + svuint64_t sign_bit + = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); - svfloat64_t u = svadd_x (pg, x, x); + svfloat64_t p = e2xm1_inline (pg, ax, d); + svfloat64_t q = svadd_x (pg, p, 2); - /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - svfloat64_t q = expm1_inline (u, pg, d); - svfloat64_t qp2 = svadd_x (pg, q, 2); + /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly + rounded, at this point we can return 1 directly, with sign correction. + This will also act as a guard against our approximation overflowing. 
*/ + svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q)); - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svdiv_x (pg, q, qp2), special); - return svdiv_x (pg, q, qp2); + return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y))); } diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 772b16a35..1c3c39251 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -14,6 +14,7 @@ sysdep_routines += \ memset_kunpeng \ memset_mops \ memset_oryon1 \ + memset_sve_zva64 \ memset_zva64 \ strlen_asimd \ strlen_generic \ diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index 0481e450b..8dc314b67 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -57,6 +57,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) #if HAVE_AARCH64_SVE_ASM IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx) + IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64) #endif IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c index f6194e4a9..872f39f00 100644 --- a/sysdeps/aarch64/multiarch/memset.c +++ b/sysdeps/aarch64/multiarch/memset.c @@ -36,6 +36,7 @@ extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden; extern __typeof (__redirect_memset) __memset_generic attribute_hidden; extern __typeof (__redirect_memset) __memset_mops attribute_hidden; extern __typeof (__redirect_memset) __memset_oryon1 attribute_hidden; +extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden; static inline __typeof (__redirect_memset) * select_memset_ifunc (void) @@ -49,6 +50,9 @@ 
select_memset_ifunc (void) { if (IS_A64FX (midr) && zva_size == 256) return __memset_a64fx; + + if (prefer_sve_ifuncs && zva_size == 64) + return __memset_sve_zva64; } if (IS_ORYON1 (midr) && zva_size == 64) diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S new file mode 100644 index 000000000..7fb40fdd9 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S @@ -0,0 +1,123 @@ +/* Optimized memset for SVE. + Copyright (C) 2025 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. + * ZVA size is 64. + */ + +#if HAVE_AARCH64_SVE_ASM + +.arch armv8.2-a+sve + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 +#define vlen x5 +#define off x3 +#define dstend2 x5 + +ENTRY (__memset_sve_zva64) + dup v0.16B, valw + cmp count, 16 + b.lo L(set_16) + + add dstend, dstin, count + cmp count, 64 + b.hs L(set_128) + + /* Set 16..63 bytes. 
*/ + mov off, 16 + and off, off, count, lsr 1 + sub dstend2, dstend, off + str q0, [dstin] + str q0, [dstin, off] + str q0, [dstend2, -16] + str q0, [dstend, -16] + ret + + .p2align 4 +L(set_16): + whilelo p0.b, xzr, count + st1b z0.b, p0, [dstin] + ret + + .p2align 4 +L(set_128): + bic dst, dstin, 15 + cmp count, 128 + b.hi L(set_long) + stp q0, q0, [dstin] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + .p2align 4 +L(set_long): + cmp count, 256 + b.lo L(no_zva) + tst valw, 255 + b.ne L(no_zva) + + str q0, [dstin] + str q0, [dst, 16] + bic dst, dstin, 31 + stp q0, q0, [dst, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + sub x8, dstend, 1 /* Write last bytes before ZVA loop. */ + bic x8, x8, 15 + stp q0, q0, [x8, -48] + str q0, [x8, -16] + str q0, [dstend, -16] + + .p2align 4 +L(zva64_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva64_loop) + ret + +L(no_zva): + str q0, [dstin] + sub count, dstend, dst /* Count is 16 too large. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 16] + stp q0, q0, [dst, 48] + add dst, dst, 64 + subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +END (__memset_sve_zva64) +#endif diff --git a/sysdeps/arm/find_exidx.c b/sysdeps/arm/find_exidx.c index 60021a072..468e01621 100644 --- a/sysdeps/arm/find_exidx.c +++ b/sysdeps/arm/find_exidx.c @@ -15,6 +15,7 @@ License along with the GNU C Library. If not, see . */ +#include #include /* Find the exception index table containing PC. 
*/ @@ -23,7 +24,7 @@ _Unwind_Ptr __gnu_Unwind_Find_exidx (_Unwind_Ptr pc, int * pcount) { struct dl_find_object data; - if (__dl_find_object ((void *) pc, &data) < 0) + if (GLRO(dl_find_object) ((void *) pc, &data) < 0) return 0; *pcount = data.dlfo_eh_count; return (_Unwind_Ptr) data.dlfo_eh_frame; diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index e871f27ff..ddb34a158 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -695,10 +695,23 @@ extern const ElfW(Phdr) *_dl_phdr; extern size_t _dl_phnum; #endif +/* Possible values for the glibc.rtld.execstack tunable. */ +enum stack_tunable_mode + { + /* Do not allow executable stacks, even if program requires it. */ + stack_tunable_mode_disable = 0, + /* Follows either ABI requirement, or the PT_GNU_STACK value. */ + stack_tunable_mode_enable = 1, + /* Always enable an executable stack. */ + stack_tunable_mode_force = 2 + }; + +void _dl_handle_execstack_tunable (void) attribute_hidden; + /* This function changes the permission of the memory region pointed by STACK_ENDP to executable and update the internal memory protection flags for future thread stack creation. */ -int _dl_make_stack_executable (void **stack_endp) attribute_hidden; +int _dl_make_stack_executable (const void *stack_endp) attribute_hidden; /* Variable pointing to the end of the stack (or close to it). This value must be constant over the runtime of the application. 
Some programs diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c index 1e09e46f0..d1c71b2aa 100644 --- a/sysdeps/ieee754/dbl-64/e_atanh.c +++ b/sysdeps/ieee754/dbl-64/e_atanh.c @@ -44,6 +44,11 @@ static const double huge = 1e300; +#ifndef SECTION +# define SECTION +#endif + +SECTION double __ieee754_atanh (double x) { @@ -73,4 +78,7 @@ __ieee754_atanh (double x) return copysign (t, x); } + +#ifndef __ieee754_atanh libm_alias_finite (__ieee754_atanh, __atanh) +#endif diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c index b4b5857dd..3f787967f 100644 --- a/sysdeps/ieee754/dbl-64/e_sinh.c +++ b/sysdeps/ieee754/dbl-64/e_sinh.c @@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $"; static const double one = 1.0, shuge = 1.0e307; +#ifndef SECTION +# define SECTION +#endif + +SECTION double __ieee754_sinh (double x) { @@ -90,4 +95,7 @@ __ieee754_sinh (double x) /* |x| > overflowthresold, sinh(x) overflow */ return math_narrow_eval (x * shuge); } + +#ifndef __ieee754_sinh libm_alias_finite (__ieee754_sinh, __sinh) +#endif diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h index 299a2ff8c..3382e385f 100644 --- a/sysdeps/ieee754/dbl-64/math_config.h +++ b/sysdeps/ieee754/dbl-64/math_config.h @@ -195,16 +195,18 @@ check_uflow (double x) extern const struct exp_data { double invln2N; - double shift; double negln2hiN; double negln2loN; double poly[4]; /* Last four coefficients. 
*/ + double shift; + double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; - double invlog10_2N; + double neglog10_2hiN; double neglog10_2loN; double exp10_poly[5]; + double invlog10_2N; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data attribute_hidden; diff --git a/sysdeps/ieee754/dbl-64/s_fma.c b/sysdeps/ieee754/dbl-64/s_fma.c index 20f617b99..42351c6b3 100644 --- a/sysdeps/ieee754/dbl-64/s_fma.c +++ b/sysdeps/ieee754/dbl-64/s_fma.c @@ -244,6 +244,9 @@ __fma (double x, double y, double z) /* Reset rounding mode and test for inexact simultaneously. */ int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0; + /* Ensure value of a1 + u.d is not reused. */ + a1 = math_opt_barrier (a1); + if (__glibc_likely (adjust == 0)) { if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c index 673a97102..13063db04 100644 --- a/sysdeps/ieee754/dbl-64/s_tanh.c +++ b/sysdeps/ieee754/dbl-64/s_tanh.c @@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $"; static const double one = 1.0, two = 2.0, tiny = 1.0e-300; +#ifndef SECTION +# define SECTION +#endif + +SECTION double __tanh (double x) { diff --git a/sysdeps/ieee754/flt-32/e_sinhf.c b/sysdeps/ieee754/flt-32/e_sinhf.c index c007c7d17..dee96fc7c 100644 --- a/sysdeps/ieee754/flt-32/e_sinhf.c +++ b/sysdeps/ieee754/flt-32/e_sinhf.c @@ -83,7 +83,7 @@ __ieee754_sinhf (float x) { /* |x| <= 0x1.250bfep-11 */ if (__glibc_unlikely (ux < 0x66000000u)) /* |x| < 0x1p-24 */ return fmaf (x, fabsf (x), x); - if (__glibc_unlikely (st.uarg == asuint (ux))) + if (__glibc_unlikely (st.uarg == ux)) { float sgn = copysignf (1.0f, x); return sgn * st.rh + sgn * st.rl; diff --git a/sysdeps/ieee754/flt-32/s_log10p1f.c b/sysdeps/ieee754/flt-32/s_log10p1f.c index 64deb1eed..4e11d55d4 100644 --- a/sysdeps/ieee754/flt-32/s_log10p1f.c +++ b/sysdeps/ieee754/flt-32/s_log10p1f.c @@ -70,7 +70,7 @@ __log10p1f (float x) }; static const 
double tl[] = { - 0x1.562ec497ef351p-43, 0x1.b9476892ea99cp-8, 0x1.b5e909c959eecp-7, + -0x1.562ec497ef351p-43, 0x1.b9476892ea99cp-8, 0x1.b5e909c959eecp-7, 0x1.45f4f59ec84fp-6, 0x1.af5f92cbcf2aap-6, 0x1.0ba01a6069052p-5, 0x1.3ed119b99dd41p-5, 0x1.714834298a088p-5, 0x1.a30a9d98309c1p-5, 0x1.d41d51266b9d9p-5, 0x1.02428c0f62dfcp-4, 0x1.1a23444eea521p-4, diff --git a/sysdeps/ieee754/flt-32/s_tanf.c b/sysdeps/ieee754/flt-32/s_tanf.c index dfe56fc2a..5ee1d6f35 100644 --- a/sysdeps/ieee754/flt-32/s_tanf.c +++ b/sysdeps/ieee754/flt-32/s_tanf.c @@ -166,7 +166,7 @@ __tanf (float x) uint32_t sgn = t >> 31; for (int j = 0; j < array_length (st); j++) { - if (__glibc_unlikely (asfloat (st[j].arg) == ax)) + if (__glibc_unlikely (asuint (st[j].arg) == ax)) { if (sgn) return -st[j].rh - st[j].rl; diff --git a/sysdeps/mach/hurd/dl-execstack.c b/sysdeps/mach/hurd/dl-execstack.c index 0617d3a16..dc4719bd3 100644 --- a/sysdeps/mach/hurd/dl-execstack.c +++ b/sysdeps/mach/hurd/dl-execstack.c @@ -26,12 +26,11 @@ extern struct hurd_startup_data *_dl_hurd_data attribute_hidden; so as to mprotect it. */ int -_dl_make_stack_executable (void **stack_endp) +_dl_make_stack_executable (const void *stack_endp) { /* Challenge the caller. 
*/ - if (__builtin_expect (*stack_endp != __libc_stack_end, 0)) + if (__glibc_unlikely (stack_endp != __libc_stack_end)) return EPERM; - *stack_endp = NULL; #if IS_IN (rtld) if (__mprotect ((void *)_dl_hurd_data->stack_base, _dl_hurd_data->stack_size, diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h index 7c24c0a6b..e614c7f3c 100644 --- a/sysdeps/nptl/bits/thread-shared-types.h +++ b/sysdeps/nptl/bits/thread-shared-types.h @@ -99,6 +99,8 @@ struct __pthread_cond_s unsigned int __g1_orig_size; unsigned int __wrefs; unsigned int __g_signals[2]; + unsigned int __unused_initialized_1; + unsigned int __unused_initialized_2; }; typedef unsigned int __tss_t; diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c index f487bfb66..8629b5d41 100644 --- a/sysdeps/nptl/dl-tls_init_tp.c +++ b/sysdeps/nptl/dl-tls_init_tp.c @@ -23,6 +23,7 @@ #include #include #include +#include #define TUNABLE_NAMESPACE pthread #include diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h index 050b4ab8d..9ad36cabe 100644 --- a/sysdeps/nptl/pthread.h +++ b/sysdeps/nptl/pthread.h @@ -152,7 +152,7 @@ enum /* Conditional variable handling. */ -#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } } +#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } } /* Cleanup buffers */ diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S deleted file mode 100644 index 96ad5a2e1..000000000 --- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S +++ /dev/null @@ -1,315 +0,0 @@ -/* Optimized memchr implementation for POWER10 LE. - Copyright (C) 2021-2025 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include - -# ifndef MEMCHR -# define MEMCHR __memchr -# endif -# define M_VREG_ZERO v20 -# define M_OFF_START_LOOP 256 -# define MEMCHR_SUBTRACT_VECTORS \ - vsububm v4,v4,v18; \ - vsububm v5,v5,v18; \ - vsububm v6,v6,v18; \ - vsububm v7,v7,v18; -# define M_TAIL(vreg,increment) \ - vctzlsbb r4,vreg; \ - cmpld r5,r4; \ - ble L(null); \ - addi r4,r4,increment; \ - add r3,r6,r4; \ - blr - -/* TODO: Replace macros by the actual instructions when minimum binutils becomes - >= 2.35. This is used to keep compatibility with older versions. */ -#define M_VEXTRACTBM(rt,vrb) \ - .long(((4)<<(32-6)) \ - | ((rt)<<(32-11)) \ - | ((8)<<(32-16)) \ - | ((vrb)<<(32-21)) \ - | 1602) - -#define M_LXVP(xtp,dq,ra) \ - .long(((6)<<(32-6)) \ - | ((((xtp)-32)>>1)<<(32-10)) \ - | ((1)<<(32-11)) \ - | ((ra)<<(32-16)) \ - | dq) - -#define CHECK16B(vreg,offset,addr,label) \ - lxv vreg+32,offset(addr); \ - vcmpequb. vreg,vreg,v18; \ - bne cr6,L(label); \ - cmpldi r5,16; \ - ble L(null); \ - addi r5,r5,-16; - -/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # - of bytes already checked. */ -#define CHECK64B(offset,addr,label) \ - M_LXVP(v4+32,offset,addr); \ - M_LXVP(v6+32,offset+32,addr); \ - MEMCHR_SUBTRACT_VECTORS; \ - vminub v14,v4,v5; \ - vminub v15,v6,v7; \ - vminub v16,v14,v15; \ - vcmpequb. 
v0,v16,M_VREG_ZERO; \ - beq cr6,$+12; \ - li r7,offset; \ - b L(label); \ - cmpldi r5,64; \ - ble L(null); \ - addi r5,r5,-64 - -/* Implements the function - void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */ - - .machine power9 - -ENTRY_TOCLESS (MEMCHR) - CALL_MCOUNT 3 - - cmpldi r5,0 - beq L(null) - mr r0,r5 - xori r6,r4,0xff - - mtvsrd v18+32,r4 /* matching char in v18 */ - mtvsrd v19+32,r6 /* non matching char in v19 */ - - vspltb v18,v18,7 /* replicate */ - vspltb v19,v19,7 /* replicate */ - vspltisb M_VREG_ZERO,0 - - /* Next 16B-aligned address. Prepare address for L(aligned). */ - addi r6,r3,16 - clrrdi r6,r6,4 - - /* Align data and fill bytes not loaded with non matching char. */ - lvx v0,0,r3 - lvsr v1,0,r3 - vperm v0,v19,v0,v1 - - vcmpequb. v6,v0,v18 - bne cr6,L(found) - sub r4,r6,r3 - cmpld r5,r4 - ble L(null) - sub r5,r5,r4 - - /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is - optimized for longer strings, so checking the first bytes in 16B - chunks benefits a lot small strings. */ - .p2align 5 -L(aligned): - cmpldi r5,0 - beq L(null) - - CHECK16B(v0,0,r6,tail1) - CHECK16B(v1,16,r6,tail2) - CHECK16B(v2,32,r6,tail3) - CHECK16B(v3,48,r6,tail4) - CHECK16B(v4,64,r6,tail5) - CHECK16B(v5,80,r6,tail6) - CHECK16B(v6,96,r6,tail7) - CHECK16B(v7,112,r6,tail8) - CHECK16B(v8,128,r6,tail9) - CHECK16B(v9,144,r6,tail10) - CHECK16B(v10,160,r6,tail11) - CHECK16B(v0,176,r6,tail12) - CHECK16B(v1,192,r6,tail13) - CHECK16B(v2,208,r6,tail14) - CHECK16B(v3,224,r6,tail15) - - cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to - choose how we will perform the main loop. */ - - /* Prepare address for the loop. */ - addi r4,r3,M_OFF_START_LOOP - clrrdi r4,r4,6 - sub r6,r4,r3 - sub r5,r0,r6 - addi r6,r4,128 - - /* If c == 0, use the loop without the vsububm. 
*/ - beq cr5,L(loop) - - /* This is very similar to the block after L(loop), the difference is - that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract - each byte loaded by the char we are looking for, this way we can keep - using vminub to merge the results and checking for nulls. */ - .p2align 5 -L(memchr_loop): - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - b L(memchr_loop) - /* Switch to a more aggressive approach checking 64B each time. Use 2 - pointers 128B apart and unroll the loop once to make the pointer - updates and usages separated enough to avoid stalls waiting for - address calculation. */ - .p2align 5 -L(loop): -#undef MEMCHR_SUBTRACT_VECTORS -#define MEMCHR_SUBTRACT_VECTORS /* nothing */ - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - b L(loop) - - .p2align 5 -L(pre_tail_64b): - mr r6,r4 -L(tail_64b): - /* OK, we found a null byte. Let's look for it in the current 64-byte - block and mark it in its corresponding VR. lxvp vx,0(ry) puts the - low 16B bytes into vx+1, and the high into vx, so the order here is - v5, v4, v7, v6. */ - vcmpequb v1,v5,M_VREG_ZERO - vcmpequb v2,v4,M_VREG_ZERO - vcmpequb v3,v7,M_VREG_ZERO - vcmpequb v4,v6,M_VREG_ZERO - - /* Take into account the other 64B blocks we had already checked. */ - add r6,r6,r7 - /* Extract first bit of each byte. */ - M_VEXTRACTBM(r8,v1) - M_VEXTRACTBM(r9,v2) - M_VEXTRACTBM(r10,v3) - M_VEXTRACTBM(r11,v4) - - /* Shift each value into their corresponding position. 
*/ - sldi r9,r9,16 - sldi r10,r10,32 - sldi r11,r11,48 - - /* Merge the results. */ - or r8,r8,r9 - or r9,r10,r11 - or r11,r9,r8 - - cnttzd r0,r11 /* Count trailing zeros before the match. */ - cmpld r5,r0 - ble L(null) - add r3,r6,r0 /* Compute final address. */ - blr - - .p2align 5 -L(tail1): - M_TAIL(v0,0) - - .p2align 5 -L(tail2): - M_TAIL(v1,16) - - .p2align 5 -L(tail3): - M_TAIL(v2,32) - - .p2align 5 -L(tail4): - M_TAIL(v3,48) - - .p2align 5 -L(tail5): - M_TAIL(v4,64) - - .p2align 5 -L(tail6): - M_TAIL(v5,80) - - .p2align 5 -L(tail7): - M_TAIL(v6,96) - - .p2align 5 -L(tail8): - M_TAIL(v7,112) - - .p2align 5 -L(tail9): - M_TAIL(v8,128) - - .p2align 5 -L(tail10): - M_TAIL(v9,144) - - .p2align 5 -L(tail11): - M_TAIL(v10,160) - - .p2align 5 -L(tail12): - M_TAIL(v0,176) - - .p2align 5 -L(tail13): - M_TAIL(v1,192) - - .p2align 5 -L(tail14): - M_TAIL(v2,208) - - .p2align 5 -L(tail15): - M_TAIL(v3,224) - - .p2align 5 -L(found): - vctzlsbb r7,v6 - cmpld r5,r7 - ble L(null) - add r3,r3,r7 - blr - - .p2align 5 -L(null): - li r3,0 - blr - -END (MEMCHR) - -weak_alias (__memchr, memchr) -libc_hidden_builtin_def (memchr) diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S deleted file mode 100644 index fffa1ee0a..000000000 --- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +++ /dev/null @@ -1,233 +0,0 @@ -/* Optimized strcmp implementation for PowerPC64/POWER10. - Copyright (C) 2021-2025 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ -#include - -#ifndef STRCMP -# define STRCMP strcmp -#endif - -/* Implements the function - int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */ - -/* TODO: Change this to actual instructions when minimum binutils is upgraded - to 2.27. Macros are defined below for these newer instructions in order - to maintain compatibility. */ - -#define LXVP(xtp,dq,ra) \ - .long(((6)<<(32-6)) \ - | ((((xtp)-32)>>1)<<(32-10)) \ - | ((1)<<(32-11)) \ - | ((ra)<<(32-16)) \ - | dq) - -#define COMPARE_16(vreg1,vreg2,offset) \ - lxv vreg1+32,offset(r3); \ - lxv vreg2+32,offset(r4); \ - vcmpnezb. v7,vreg1,vreg2; \ - bne cr6,L(different); \ - -#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ - LXVP(vreg1+32,offset,r3); \ - LXVP(vreg2+32,offset,r4); \ - vcmpnezb. v7,vreg1+1,vreg2+1; \ - bne cr6,L(label1); \ - vcmpnezb. v7,vreg1,vreg2; \ - bne cr6,L(label2); \ - -#define TAIL(vreg1,vreg2) \ - vctzlsbb r6,v7; \ - vextubrx r5,r6,vreg1; \ - vextubrx r4,r6,vreg2; \ - subf r3,r4,r5; \ - blr; \ - -#define CHECK_N_BYTES(reg1,reg2,len_reg) \ - sldi r0,len_reg,56; \ - lxvl 32+v4,reg1,r0; \ - lxvl 32+v5,reg2,r0; \ - add reg1,reg1,len_reg; \ - add reg2,reg2,len_reg; \ - vcmpnezb v7,v4,v5; \ - vctzlsbb r6,v7; \ - cmpld cr7,r6,len_reg; \ - blt cr7,L(different); \ - - /* TODO: change this to .machine power10 when the minimum required - binutils allows it. */ - - .machine power9 -ENTRY_TOCLESS (STRCMP, 4) - andi. r7,r3,4095 - andi. r8,r4,4095 - cmpldi cr0,r7,4096-16 - cmpldi cr1,r8,4096-16 - bgt cr0,L(crosses) - bgt cr1,L(crosses) - COMPARE_16(v4,v5,0) - -L(crosses): - andi. r7,r3,15 - subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ - andi. r9,r4,15 - subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). 
*/ - cmpld cr7,r7,r5 - beq cr7,L(same_aligned) - blt cr7,L(nalign1_min) - - /* nalign2 is minimum and s2 pointer is aligned. */ - CHECK_N_BYTES(r3,r4,r5) - /* Are we on the 64B hunk which crosses a page? */ - andi. r10,r3,63 /* Determine offset into 64B hunk. */ - andi. r8,r3,15 /* The offset into the 16B hunk. */ - neg r7,r3 - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ - rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */ - beq L(compare_64_pagecross) - mtctr r7 - b L(compare_64B_unaligned) - - /* nalign1 is minimum and s1 pointer is aligned. */ -L(nalign1_min): - CHECK_N_BYTES(r3,r4,r7) - /* Are we on the 64B hunk which crosses a page? */ - andi. r10,r4,63 /* Determine offset into 64B hunk. */ - andi. r8,r4,15 /* The offset into the 16B hunk. */ - neg r7,r4 - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ - rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ - beq L(compare_64_pagecross) - mtctr r7 - - .p2align 5 -L(compare_64B_unaligned): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - bdnz L(compare_64B_unaligned) - - /* Cross the page boundary of s2, carefully. Only for first - iteration we have to get the count of 64B blocks to be checked. - From second iteration and beyond, loop counter is always 63. 
*/ -L(compare_64_pagecross): - li r11, 63 - mtctr r11 - cmpldi r10,16 - ble L(cross_4) - cmpldi r10,32 - ble L(cross_3) - cmpldi r10,48 - ble L(cross_2) -L(cross_1): - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - addi r3,r3,48 - addi r4,r4,48 - b L(compare_64B_unaligned) -L(cross_2): - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r3,r3,32 - addi r4,r4,32 - b L(compare_64B_unaligned) -L(cross_3): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r3,r3,32 - addi r4,r4,32 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - b L(compare_64B_unaligned) -L(cross_4): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - addi r3,r3,48 - addi r4,r4,48 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - b L(compare_64B_unaligned) - -L(same_aligned): - CHECK_N_BYTES(r3,r4,r7) - /* Align s1 to 32B and adjust s2 address. - Use lxvp only if both s1 and s2 are 32B aligned. */ - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - - clrldi r6,r3,59 - subfic r5,r6,32 - add r3,r3,r5 - add r4,r4,r5 - andi. r5,r4,0x1F - beq cr0,L(32B_aligned_loop) - - .p2align 5 -L(16B_aligned_loop): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - b L(16B_aligned_loop) - - /* Calculate and return the difference. 
*/ -L(different): - TAIL(v4,v5) - - .p2align 5 -L(32B_aligned_loop): - COMPARE_32(v14,v16,0,tail1,tail2) - COMPARE_32(v18,v20,32,tail3,tail4) - COMPARE_32(v22,v24,64,tail5,tail6) - COMPARE_32(v26,v28,96,tail7,tail8) - addi r3,r3,128 - addi r4,r4,128 - b L(32B_aligned_loop) - -L(tail1): TAIL(v15,v17) -L(tail2): TAIL(v14,v16) -L(tail3): TAIL(v19,v21) -L(tail4): TAIL(v18,v20) -L(tail5): TAIL(v23,v25) -L(tail6): TAIL(v22,v24) -L(tail7): TAIL(v27,v29) -L(tail8): TAIL(v26,v28) - -END (STRCMP) -libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S b/sysdeps/powerpc/powerpc64/le/power10/strncmp.S deleted file mode 100644 index 10700dd40..000000000 --- a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S +++ /dev/null @@ -1,271 +0,0 @@ -/* Optimized strncmp implementation for PowerPC64/POWER10. - Copyright (C) 2024-2025 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include - -/* Implements the function - - int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n) - - The implementation uses unaligned doubleword access to avoid specialized - code paths depending of data alignment for first 32 bytes and uses - vectorised loops after that. 
*/ - -#ifndef STRNCMP -# define STRNCMP strncmp -#endif - -/* TODO: Change this to actual instructions when minimum binutils is upgraded - to 2.27. Macros are defined below for these newer instructions in order - to maintain compatibility. */ - -#define LXVP(xtp,dq,ra) \ - .long(((6)<<(32-6)) \ - | ((((xtp)-32)>>1)<<(32-10)) \ - | ((1)<<(32-11)) \ - | ((ra)<<(32-16)) \ - | dq) - -#define COMPARE_16(vreg1,vreg2,offset) \ - lxv vreg1+32,offset(r3); \ - lxv vreg2+32,offset(r4); \ - vcmpnezb. v7,vreg1,vreg2; \ - bne cr6,L(different); \ - cmpldi cr7,r5,16; \ - ble cr7,L(ret0); \ - addi r5,r5,-16; - -#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ - LXVP(vreg1+32,offset,r3); \ - LXVP(vreg2+32,offset,r4); \ - vcmpnezb. v7,vreg1+1,vreg2+1; \ - bne cr6,L(label1); \ - vcmpnezb. v7,vreg1,vreg2; \ - bne cr6,L(label2); \ - cmpldi cr7,r5,32; \ - ble cr7,L(ret0); \ - addi r5,r5,-32; - -#define TAIL_FIRST_16B(vreg1,vreg2) \ - vctzlsbb r6,v7; \ - cmpld cr7,r5,r6; \ - ble cr7,L(ret0); \ - vextubrx r5,r6,vreg1; \ - vextubrx r4,r6,vreg2; \ - subf r3,r4,r5; \ - blr; - -#define TAIL_SECOND_16B(vreg1,vreg2) \ - vctzlsbb r6,v7; \ - addi r0,r6,16; \ - cmpld cr7,r5,r0; \ - ble cr7,L(ret0); \ - vextubrx r5,r6,vreg1; \ - vextubrx r4,r6,vreg2; \ - subf r3,r4,r5; \ - blr; - -#define CHECK_N_BYTES(reg1,reg2,len_reg) \ - sldi r6,len_reg,56; \ - lxvl 32+v4,reg1,r6; \ - lxvl 32+v5,reg2,r6; \ - add reg1,reg1,len_reg; \ - add reg2,reg2,len_reg; \ - vcmpnezb v7,v4,v5; \ - vctzlsbb r6,v7; \ - cmpld cr7,r6,len_reg; \ - blt cr7,L(different); \ - cmpld cr7,r5,len_reg; \ - ble cr7,L(ret0); \ - sub r5,r5,len_reg; \ - - /* TODO: change this to .machine power10 when the minimum required - binutils allows it. */ - .machine power9 -ENTRY_TOCLESS (STRNCMP, 4) - /* Check if size is 0. */ - cmpdi cr0,r5,0 - beq cr0,L(ret0) - andi. r7,r3,4095 - andi. 
r8,r4,4095 - cmpldi cr0,r7,4096-16 - cmpldi cr1,r8,4096-16 - bgt cr0,L(crosses) - bgt cr1,L(crosses) - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - -L(crosses): - andi. r7,r3,15 - subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ - andi. r9,r4,15 - subfic r8,r9,16 /* r8(nalign2) = 16 - (str2 & 15). */ - cmpld cr7,r7,r8 - beq cr7,L(same_aligned) - blt cr7,L(nalign1_min) - - /* nalign2 is minimum and s2 pointer is aligned. */ - CHECK_N_BYTES(r3,r4,r8) - /* Are we on the 64B hunk which crosses a page? */ - andi. r10,r3,63 /* Determine offset into 64B hunk. */ - andi. r8,r3,15 /* The offset into the 16B hunk. */ - neg r7,r3 - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ - rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ - beq L(compare_64_pagecross) - mtctr r7 - b L(compare_64B_unaligned) - - /* nalign1 is minimum and s1 pointer is aligned. */ -L(nalign1_min): - CHECK_N_BYTES(r3,r4,r7) - /* Are we on the 64B hunk which crosses a page? */ - andi. r10,r4,63 /* Determine offset into 64B hunk. */ - andi. r8,r4,15 /* The offset into the 16B hunk. */ - neg r7,r4 - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ - rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ - beq L(compare_64_pagecross) - mtctr r7 - - .p2align 5 -L(compare_64B_unaligned): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - bdnz L(compare_64B_unaligned) - - /* Cross the page boundary of s2, carefully. Only for first - iteration we have to get the count of 64B blocks to be checked. - From second iteration and beyond, loop counter is always 63. 
*/ -L(compare_64_pagecross): - li r11, 63 - mtctr r11 - cmpldi r10,16 - ble L(cross_4) - cmpldi r10,32 - ble L(cross_3) - cmpldi r10,48 - ble L(cross_2) -L(cross_1): - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - addi r3,r3,48 - addi r4,r4,48 - b L(compare_64B_unaligned) -L(cross_2): - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r3,r3,32 - addi r4,r4,32 - b L(compare_64B_unaligned) -L(cross_3): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r3,r3,32 - addi r4,r4,32 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - b L(compare_64B_unaligned) -L(cross_4): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - addi r3,r3,48 - addi r4,r4,48 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - b L(compare_64B_unaligned) - -L(same_aligned): - CHECK_N_BYTES(r3,r4,r7) - /* Align s1 to 32B and adjust s2 address. - Use lxvp only if both s1 and s2 are 32B aligned. */ - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r5,r5,32 - - clrldi r6,r3,59 - subfic r7,r6,32 - add r3,r3,r7 - add r4,r4,r7 - subf r5,r7,r5 - andi. r7,r4,0x1F - beq cr0,L(32B_aligned_loop) - - .p2align 5 -L(16B_aligned_loop): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - b L(16B_aligned_loop) - - /* Calculate and return the difference. 
*/ -L(different): - TAIL_FIRST_16B(v4,v5) - - .p2align 5 -L(32B_aligned_loop): - COMPARE_32(v14,v16,0,tail1,tail2) - COMPARE_32(v18,v20,32,tail3,tail4) - COMPARE_32(v22,v24,64,tail5,tail6) - COMPARE_32(v26,v28,96,tail7,tail8) - addi r3,r3,128 - addi r4,r4,128 - b L(32B_aligned_loop) - -L(tail1): TAIL_FIRST_16B(v15,v17) -L(tail2): TAIL_SECOND_16B(v14,v16) -L(tail3): TAIL_FIRST_16B(v19,v21) -L(tail4): TAIL_SECOND_16B(v18,v20) -L(tail5): TAIL_FIRST_16B(v23,v25) -L(tail6): TAIL_SECOND_16B(v22,v24) -L(tail7): TAIL_FIRST_16B(v27,v29) -L(tail8): TAIL_SECOND_16B(v26,v28) - - .p2align 5 -L(ret0): - li r3,0 - blr - -END(STRNCMP) -libc_hidden_builtin_def(strncmp) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index dc7c5b14e..142e6c24c 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -31,12 +31,11 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ strncase-power8 ifneq (,$(filter %le,$(config-machine))) -sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ - memmove-power10 memset-power10 rawmemchr-power9 \ - rawmemchr-power10 strcmp-power9 strcmp-power10 \ - strncmp-power9 strncmp-power10 strcpy-power9 strcat-power10 \ - stpcpy-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ - strlen-power10 +sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ + rawmemchr-power9 rawmemchr-power10 \ + strcmp-power9 strncmp-power9 \ + strcpy-power9 strcat-power10 stpcpy-power9 \ + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 0a31a5853..de288a0d8 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ 
b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -164,9 +164,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */ IFUNC_IMPL (i, name, strncmp, #ifdef __LITTLE_ENDIAN__ - IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX, - __strncmp_power10) IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC, __strncmp_power9) @@ -229,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */ IFUNC_IMPL (i, name, memchr, -#ifdef __LITTLE_ENDIAN__ - IFUNC_IMPL_ADD (array, i, memchr, - hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX, - __memchr_power10) -#endif IFUNC_IMPL_ADD (array, i, memchr, hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC, @@ -386,10 +377,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. 
*/ IFUNC_IMPL (i, name, strcmp, #ifdef __LITTLE_ENDIAN__ - IFUNC_IMPL_ADD (array, i, strcmp, - (hwcap2 & PPC_FEATURE2_ARCH_3_1) - && (hwcap & PPC_FEATURE_HAS_VSX), - __strcmp_power10) IFUNC_IMPL_ADD (array, i, strcmp, hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC, diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c index b63c7968c..3abd64aed 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c +++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c @@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden; extern __typeof (__memchr) __memchr_power7 attribute_hidden; extern __typeof (__memchr) __memchr_power8 attribute_hidden; -# ifdef __LITTLE_ENDIAN__ -extern __typeof (__memchr) __memchr_power10 attribute_hidden; -# endif /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc (__memchr, -# ifdef __LITTLE_ENDIAN__ - (hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __memchr_power10 : -# endif - (hwcap2 & PPC_FEATURE2_ARCH_2_07 - && hwcap & PPC_FEATURE_HAS_ALTIVEC) - ? __memchr_power8 : - (hwcap & PPC_FEATURE_ARCH_2_06) - ? __memchr_power7 - : __memchr_ppc); + (hwcap2 & PPC_FEATURE2_ARCH_2_07 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) + ? __memchr_power8 : + (hwcap & PPC_FEATURE_ARCH_2_06) + ? 
__memchr_power7 + : __memchr_ppc); weak_alias (__memchr, memchr) libc_hidden_builtin_def (memchr) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c index 3c636e3bb..7c77c084a 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c @@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden; extern __typeof (strcmp) __strcmp_power8 attribute_hidden; # ifdef __LITTLE_ENDIAN__ extern __typeof (strcmp) __strcmp_power9 attribute_hidden; -extern __typeof (strcmp) __strcmp_power10 attribute_hidden; # endif # undef strcmp libc_ifunc_redirected (__redirect_strcmp, strcmp, # ifdef __LITTLE_ENDIAN__ - (hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __strcmp_power10 : (hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC) ? __strcmp_power9 : diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c index 0a664a620..4cfe27fa4 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c @@ -29,7 +29,6 @@ extern __typeof (strncmp) __strncmp_ppc attribute_hidden; extern __typeof (strncmp) __strncmp_power8 attribute_hidden; # ifdef __LITTLE_ENDIAN__ extern __typeof (strncmp) __strncmp_power9 attribute_hidden; -extern __typeof (strncmp) __strncmp_power10 attribute_hidden; # endif # undef strncmp @@ -37,9 +36,6 @@ extern __typeof (strncmp) __strncmp_power10 attribute_hidden; ifunc symbol properly. */ libc_ifunc_redirected (__redirect_strncmp, strncmp, # ifdef __LITTLE_ENDIAN__ - (hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __strncmp_power10 : (hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC) ? 
__strncmp_power9 : diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile index a123e28a5..7fcbc72bc 100644 --- a/sysdeps/pthread/Makefile +++ b/sysdeps/pthread/Makefile @@ -106,6 +106,7 @@ tests += \ tst-cancel28 \ tst-cancel29 \ tst-cancel30 \ + tst-cancel32 \ tst-cleanup0 \ tst-cleanup1 \ tst-cleanup2 \ @@ -271,6 +272,7 @@ tests += \ tst-spin4 \ tst-spin5 \ tst-stack1 \ + tst-stack2 \ tst-stdio1 \ tst-stdio2 \ tst-thrd-detach \ @@ -366,6 +368,7 @@ modules-names += \ tst-atfork4mod \ tst-create1mod \ tst-fini1mod \ + tst-stack2-mod \ tst-tls4moda \ tst-tls4modb \ # modules-names @@ -539,4 +542,12 @@ LDFLAGS-tst-create1 = -Wl,-export-dynamic $(objpfx)tst-create1: $(shared-thread-library) $(objpfx)tst-create1.out: $(objpfx)tst-create1mod.so +$(objpfx)tst-stack2.out: $(objpfx)tst-stack2-mod.so +$(objpfx)tst-stack2-mod.so: $(shared-thread-library) +LDFLAGS-tst-stack2-mod.so = -Wl,-z,execstack +ifeq ($(have-no-error-execstack),yes) +LDFLAGS-tst-stack2-mod.so += -Wl,--no-error-execstack +endif +tst-stack2-ENV = GLIBC_TUNABLES=glibc.rtld.execstack=2 + endif diff --git a/sysdeps/pthread/tst-cancel32.c b/sysdeps/pthread/tst-cancel32.c new file mode 100644 index 000000000..ab550c16b --- /dev/null +++ b/sysdeps/pthread/tst-cancel32.c @@ -0,0 +1,73 @@ +/* Check if pthread_setcanceltype disables asynchronous cancellation + once cancellation happens (BZ 32782) + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* The pthread_setcanceltype is a cancellation entrypoint, and if + asynchronous is enabled and the cancellation starts (on the second + pthread_setcanceltype call), the asynchronous should not restart + the process. */ + +#include + +#define NITER 1000 +#define NTHREADS 8 + +static void +tf_cleanup (void *arg) +{ +} + +static void * +tf (void *closure) +{ + pthread_cleanup_push (tf_cleanup, NULL); + for (;;) + { + /* The only possible failure for pthread_setcanceltype is an + invalid state type. */ + pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + pthread_setcanceltype (PTHREAD_CANCEL_DEFERRED, NULL); + } + pthread_cleanup_pop (1); + + return NULL; +} + +static void +poll_threads (int nthreads) +{ + pthread_t thr[nthreads]; + for (int i = 0; i < nthreads; i++) + thr[i] = xpthread_create (NULL, tf, NULL); + for (int i = 0; i < nthreads; i++) + xpthread_cancel (thr[i]); + for (int i = 0; i < nthreads; i++) + xpthread_join (thr[i]); +} + +static int +do_test (void) +{ + for (int k = 0; k < NITER; k++) + poll_threads (NTHREADS); + + return 0; +} + +#include diff --git a/sysdeps/pthread/tst-stack2-mod.c b/sysdeps/pthread/tst-stack2-mod.c new file mode 100644 index 000000000..806fdbcd8 --- /dev/null +++ b/sysdeps/pthread/tst-stack2-mod.c @@ -0,0 +1,39 @@ +/* Check if pthread_getattr_np works within modules with non-executable + stacks (BZ 32897). + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +bool init_result; + +void +__attribute__ ((constructor)) +init (void) +{ + pthread_t me = pthread_self (); + pthread_attr_t attr; + init_result = pthread_getattr_np (me, &attr) == 0; +} + +int +mod_func (void) +{ + pthread_t me = pthread_self (); + pthread_attr_t attr; + return pthread_getattr_np (me, &attr); +} diff --git a/sysdeps/pthread/tst-stack2.c b/sysdeps/pthread/tst-stack2.c new file mode 100644 index 000000000..20ab5af16 --- /dev/null +++ b/sysdeps/pthread/tst-stack2.c @@ -0,0 +1,47 @@ +/* Check if pthread_getattr_np works within modules with non-executable + stacks (BZ 32897). + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#include +#include +#include +#include + +static int +do_test (void) +{ + { + pthread_t me = pthread_self (); + pthread_attr_t attr; + TEST_COMPARE (pthread_getattr_np (me, &attr), 0); + } + + void *h = xdlopen ("tst-stack2-mod.so", RTLD_NOW); + + bool *init_result = xdlsym (h, "init_result"); + TEST_COMPARE (*init_result, true); + + int (*mod_func)(void) = xdlsym (h, "mod_func"); + TEST_COMPARE (mod_func (), 0); + + xdlclose (h); + + return 0; +} + +#include diff --git a/sysdeps/riscv/dl-machine.h b/sysdeps/riscv/dl-machine.h index a30892f08..dcc3e0883 100644 --- a/sysdeps/riscv/dl-machine.h +++ b/sysdeps/riscv/dl-machine.h @@ -348,7 +348,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], gotplt[1] = (ElfW(Addr)) l; } - if (l->l_type == lt_executable && l->l_relocated) +#ifdef SHARED + if (l->l_type == lt_executable) { /* The __global_pointer$ may not be defined by the linker if the $gp register does not be used to access the global variable @@ -362,12 +363,16 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], _dl_lookup_symbol_x ("__global_pointer$", l, &ref, l->l_scope, NULL, 0, 0, NULL); if (ref) - asm ( - "mv gp, %0\n" - : - : "r" (ref->st_value) - ); + asm ( + "mv gp, %0\n" + : + : "r" (ref->st_value + l->l_addr) + /* Don't use SYMBOL_ADDRESS here since __global_pointer$ + can be SHN_ABS type, but we need the address relative to + PC, not the absolute address. */ + ); } +#endif #endif return lazy; } diff --git a/sysdeps/sparc/sparc32/start.S b/sysdeps/sparc/sparc32/start.S index 694b020ce..8393760da 100644 --- a/sysdeps/sparc/sparc32/start.S +++ b/sysdeps/sparc/sparc32/start.S @@ -35,6 +35,7 @@ #include +#define FRAME_SIZE 104 .section ".text" .align 4 @@ -48,12 +49,12 @@ _start: /* Terminate the stack frame, and reserve space for functions to drop their arguments. */ mov %g0, %fp - sub %sp, 6*4, %sp + sub %sp, FRAME_SIZE, %sp /* Extract the arguments and environment as encoded on the stack. 
The argument info starts after one register window (16 words) past the SP. */ - ld [%sp+22*4], %o1 - add %sp, 23*4, %o2 + ld [%sp+168], %o1 + add %sp, 172, %o2 /* Load the addresses of the user entry points. */ #ifndef PIC @@ -73,6 +74,10 @@ _start: be NULL. */ mov %g1, %o5 + /* Provide the highest stack address to update the __libc_stack_end (used + to enable executable stacks if required). */ + st %sp, [%sp+23*4] + /* Let libc do the rest of the initialization, and call main. */ call __libc_start_main nop diff --git a/sysdeps/sparc/sparc64/start.S b/sysdeps/sparc/sparc64/start.S index c9c25c2e4..08e1e7721 100644 --- a/sysdeps/sparc/sparc64/start.S +++ b/sysdeps/sparc/sparc64/start.S @@ -74,6 +74,10 @@ _start: be NULL. */ mov %g1, %o5 + /* Provide the highest stack address to update the __libc_stack_end (used + to enable executable stacks if required). */ + stx %sp, [%sp+STACK_BIAS+22*8] + /* Let libc do the rest of the initialization, and call main. */ call __libc_start_main nop diff --git a/sysdeps/unix/sysv/linux/aarch64/Makefile b/sysdeps/unix/sysv/linux/aarch64/Makefile index 1fdad67fa..0839f0b08 100644 --- a/sysdeps/unix/sysv/linux/aarch64/Makefile +++ b/sysdeps/unix/sysv/linux/aarch64/Makefile @@ -3,7 +3,134 @@ sysdep_headers += sys/elf.h tests += \ tst-aarch64-pkey \ # tests -endif + +ifneq (no,$(findstring no,$(have-cc-gcs) $(have-test-cc-gcs) $(have-ld-gcs))) + +gcs-tests-dynamic = \ + tst-gcs-disabled \ + tst-gcs-dlopen-disabled \ + tst-gcs-dlopen-enforced \ + tst-gcs-dlopen-optional-off \ + tst-gcs-dlopen-optional-on \ + tst-gcs-dlopen-override \ + tst-gcs-enforced \ + tst-gcs-enforced-abort \ + tst-gcs-noreturn \ + tst-gcs-optional-off \ + tst-gcs-optional-on \ + tst-gcs-override \ + tst-gcs-shared-disabled \ + tst-gcs-shared-enforced-abort \ + tst-gcs-shared-optional \ + tst-gcs-shared-override \ + # gcs-tests-dynamic + +gcs-tests-static = \ + tst-gcs-disabled-static \ + tst-gcs-enforced-static \ + tst-gcs-enforced-static-abort \ + 
tst-gcs-optional-static-off \ + tst-gcs-optional-static-on \ + tst-gcs-override-static \ + # gcs-tests-static + +tests += \ + $(gcs-tests-dynamic) \ + $(gcs-tests-static) \ + # tests + +tests-static += \ + $(gcs-tests-static) \ + # tests-static + +define run-gcs-abort-test + $(test-wrapper-env) $(run-program-env) \ + $(tst-gcs-$*-abort-ENV) $(host-test-program-cmd) +endef + +$(objpfx)tst-gcs-%-abort.out: $(..)sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh \ + $(objpfx)tst-gcs-%-abort + $(SHELL) $< $(common-objpfx) $(test-name) '$(run-gcs-abort-test)'; \ + $(evaluate-test) + +LDFLAGS-tst-gcs-disabled += -Wl,-z gcs=always +LDFLAGS-tst-gcs-enforced += -Wl,-z gcs=always +LDFLAGS-tst-gcs-enforced-abort += -Wl,-z gcs=never +LDFLAGS-tst-gcs-optional-on += -Wl,-z gcs=always +LDFLAGS-tst-gcs-optional-off += -Wl,-z gcs=never +LDFLAGS-tst-gcs-override += -Wl,-z gcs=never + +LDFLAGS-tst-gcs-disabled-static += -Wl,-z gcs=always +LDFLAGS-tst-gcs-enforced-static += -Wl,-z gcs=always +LDFLAGS-tst-gcs-enforced-static-abort += -Wl,-z gcs=never +LDFLAGS-tst-gcs-optional-static-on += -Wl,-z gcs=always +LDFLAGS-tst-gcs-optional-static-off += -Wl,-z gcs=never +LDFLAGS-tst-gcs-override-static += -Wl,-z gcs=never + +tst-gcs-disabled-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0 +tst-gcs-enforced-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1 +tst-gcs-enforced-abort-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1 +tst-gcs-optional-on-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2 +tst-gcs-optional-off-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2 +tst-gcs-override-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=3 + +tst-gcs-disabled-static-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0 +tst-gcs-enforced-static-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1 +tst-gcs-enforced-static-abort-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1 +tst-gcs-optional-static-on-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2 +tst-gcs-optional-static-off-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2 +tst-gcs-override-static-ENV = 
GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=3 + +# force one of the dependencies to be unmarked +LDFLAGS-tst-gcs-mod2.so += -Wl,-z gcs=never + +LDFLAGS-tst-gcs-shared-disabled = -Wl,-z gcs=always +LDFLAGS-tst-gcs-shared-enforced-abort = -Wl,-z gcs=always +LDFLAGS-tst-gcs-shared-optional = -Wl,-z gcs=always +LDFLAGS-tst-gcs-shared-override = -Wl,-z gcs=always + +modules-names += \ + tst-gcs-mod1 \ + tst-gcs-mod2 \ + tst-gcs-mod3 \ + # modules-names + +$(objpfx)tst-gcs-shared-disabled: $(objpfx)tst-gcs-mod1.so $(objpfx)tst-gcs-mod3.so +$(objpfx)tst-gcs-shared-enforced-abort: $(objpfx)tst-gcs-mod1.so $(objpfx)tst-gcs-mod3.so +$(objpfx)tst-gcs-shared-optional: $(objpfx)tst-gcs-mod1.so $(objpfx)tst-gcs-mod3.so +$(objpfx)tst-gcs-shared-override: $(objpfx)tst-gcs-mod1.so $(objpfx)tst-gcs-mod3.so +$(objpfx)tst-gcs-mod1.so: $(objpfx)tst-gcs-mod2.so + +tst-gcs-shared-disabled-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0 +tst-gcs-shared-enforced-abort-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1 +tst-gcs-shared-optional-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2 +tst-gcs-shared-override-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=3 + +LDFLAGS-tst-gcs-dlopen-disabled = -Wl,-z gcs=always +LDFLAGS-tst-gcs-dlopen-enforced = -Wl,-z gcs=always +LDFLAGS-tst-gcs-dlopen-optional-on = -Wl,-z gcs=always +LDFLAGS-tst-gcs-dlopen-optional-off = -Wl,-z gcs=never +LDFLAGS-tst-gcs-dlopen-override = -Wl,-z gcs=always + +tst-gcs-dlopen-disabled-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0 +tst-gcs-dlopen-enforced-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1 +tst-gcs-dlopen-optional-on-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2 +tst-gcs-dlopen-optional-off-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2 +tst-gcs-dlopen-override-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=3 + +$(objpfx)tst-gcs-dlopen-disabled.out: $(objpfx)tst-gcs-mod2.so +$(objpfx)tst-gcs-dlopen-enforced.out: $(objpfx)tst-gcs-mod2.so +$(objpfx)tst-gcs-dlopen-optional-on.out: $(objpfx)tst-gcs-mod2.so 
+$(objpfx)tst-gcs-dlopen-optional-off.out: $(objpfx)tst-gcs-mod2.so +$(objpfx)tst-gcs-dlopen-override.out: $(objpfx)tst-gcs-mod2.so + +LDFLAGS-tst-gcs-noreturn = -Wl,-z gcs=always + +tst-gcs-noreturn-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0 + +endif # ifneq (no,$(findstring no,$(have-cc-gcs) $(have-test-cc-gcs) $(have-ld-gcs))) + +endif # ifeq ($(subdir),misc) ifeq ($(subdir),stdlib) gen-as-const-headers += ucontext_i.sym diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c index 6d63c8a9e..1acc82d07 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c @@ -23,6 +23,7 @@ #include #include #include +#include #define DCZID_DZP_MASK (1 << 4) #define DCZID_BS_MASK (0xf) diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c b/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c index 3ff33ef72..c884efc3b 100644 --- a/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c +++ b/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c @@ -55,6 +55,10 @@ do_test (void) if (errno == ENOSYS || errno == EINVAL) FAIL_UNSUPPORTED ("kernel or CPU does not support memory protection keys"); + if (errno == ENOSPC) + FAIL_UNSUPPORTED + ("no keys available or kernel does not support memory" + " protection keys"); FAIL_EXIT1 ("pkey_alloc: %m"); } diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh new file mode 100644 index 000000000..9e2be2d5c --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh @@ -0,0 +1,39 @@ +#!/bin/sh +# Test wrapper for AArch64 tests for GCS that are expected to abort. +# Copyright (C) 2025 Free Software Foundation, Inc. +# This file is part of the GNU C Library. 
+ +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. + +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +objpfx=$1; shift +tstname=$1; shift +tstrun=$1; shift + +logfile=$objpfx/$tstname.out + +rm -vf $logfile +touch $logfile + +${tstrun} 2>> $logfile >> $logfile; status=$? + +if test $status -eq 127 \ + && grep -q -w "not GCS compatible" "$logfile" ; then + exit 0 +elif test $status -eq 77; then + exit 77 +else + echo "unexpected test output or exit status $status" + exit 1 +fi diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled-static.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled-static.c new file mode 100644 index 000000000..c71d68cb8 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled-static.c @@ -0,0 +1 @@ +#include "tst-gcs-disabled.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled.c new file mode 100644 index 000000000..bd688785b --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 0 +#include "tst-gcs-skeleton.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-disabled.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-disabled.c new file mode 100644 index 000000000..34395280a --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-disabled.c @@ -0,0 +1,3 @@ +#define TEST_GCS_EXPECT_ENABLED 0 +#define TEST_GCS_EXPECT_DLOPEN 1 
+#include "tst-gcs-dlopen.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-enforced.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-enforced.c new file mode 100644 index 000000000..d8489ecd2 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-enforced.c @@ -0,0 +1,3 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#define TEST_GCS_EXPECT_DLOPEN 0 +#include "tst-gcs-dlopen.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-off.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-off.c new file mode 100644 index 000000000..34395280a --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-off.c @@ -0,0 +1,3 @@ +#define TEST_GCS_EXPECT_ENABLED 0 +#define TEST_GCS_EXPECT_DLOPEN 1 +#include "tst-gcs-dlopen.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-on.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-on.c new file mode 100644 index 000000000..d8489ecd2 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-on.c @@ -0,0 +1,3 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#define TEST_GCS_EXPECT_DLOPEN 0 +#include "tst-gcs-dlopen.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-override.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-override.c new file mode 100644 index 000000000..152ffcf20 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-override.c @@ -0,0 +1,3 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#define TEST_GCS_EXPECT_DLOPEN 1 +#include "tst-gcs-dlopen.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen.c new file mode 100644 index 000000000..6e0801c63 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen.c @@ -0,0 +1,62 @@ +/* AArch64 tests for GCS for dlopen use case. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "tst-gcs-helper.h" + +#include +#include + +static int +do_test (void) +{ + /* Check if GCS could possibly be enabled. */ + if (!(getauxval (AT_HWCAP) & HWCAP_GCS)) + { + puts ("kernel or CPU does not support GCS"); + return EXIT_UNSUPPORTED; + } + /* The tst-gcs-mod2.so test library does not have GCS marking. */ + void *h = dlopen ("tst-gcs-mod2.so", RTLD_NOW); + const char *err = dlerror (); + +#if TEST_GCS_EXPECT_DLOPEN + TEST_VERIFY (h != NULL); +#else + TEST_VERIFY (h == NULL); + /* Only accept GCS-related errors; err is NULL if dlopen succeeded. 
*/ + TEST_VERIFY (err != NULL && strstr (err, "not GCS compatible") != NULL); +#endif + +#if TEST_GCS_EXPECT_ENABLED + TEST_VERIFY (__check_gcs_status ()); +#else + TEST_VERIFY (!__check_gcs_status ()); +#endif + + if (h == NULL) + printf ("dlopen error: %s\n", err); + else + { + puts ("library loaded normally"); + dlclose (h); + } + + return 0; +} + +#include diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-abort.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-abort.c new file mode 100644 index 000000000..608318f26 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-abort.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#include "tst-gcs-skeleton.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static-abort.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static-abort.c new file mode 100644 index 000000000..c20a999f6 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static-abort.c @@ -0,0 +1 @@ +#include "tst-gcs-enforced-abort.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static.c new file mode 100644 index 000000000..bb39dada5 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static.c @@ -0,0 +1 @@ +#include "tst-gcs-enforced.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced.c new file mode 100644 index 000000000..608318f26 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#include "tst-gcs-skeleton.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-helper.h b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-helper.h new file mode 100644 index 000000000..d8a586d2d --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-helper.h @@ -0,0 +1,39 @@ +/* AArch64 tests for GCS. + Copyright (C) 2025 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef TST_GCS_HELPER_H +#define TST_GCS_HELPER_H + +#include +#include +#include + +#include +#include + +static bool __check_gcs_status (void) +{ + register unsigned long x16 asm ("x16"); + asm volatile ( + "mov x16, #1 /* _CHKFEAT_GCS */\n" + "hint 40 /* CHKFEAT_X16 */\n" + : "=r" (x16)); + return x16 ^ 1; +} + +#endif // TST_GCS_HELPER_H diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod1.c similarity index 72% rename from sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S rename to sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod1.c index 43879085e..931ff8179 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod1.c @@ -1,4 +1,5 @@ -/* Copyright (C) 2024-2025 Free Software Foundation, Inc. +/* DSO for testing GCS. + Copyright (C) 2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -15,11 +16,12 @@ License along with the GNU C Library; if not, see . 
*/ -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) -#define STRNCMP __strncmp_power10 +#include -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) +int fun2 (void); // tst-gcs-mod2.c -#include -#endif +int fun1 (void) +{ + puts ("called function fun1"); + return fun2 (); +} diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod2.c similarity index 66% rename from sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S rename to sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod2.c index c9d2f4efd..f9370eb8f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod2.c @@ -1,5 +1,5 @@ -/* Optimized memchr implementation for POWER10/PPC64. - Copyright (C) 2016-2025 Free Software Foundation, Inc. +/* DSO for testing GCS. + Copyright (C) 2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,13 +16,10 @@ License along with the GNU C Library; if not, see . */ -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) -#define MEMCHR __memchr_power10 +#include -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) -#undef weak_alias -#define weak_alias(name,alias) - -#include -#endif +int fun2 (void) +{ + puts ("called function fun2"); + return 0; +} diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod3.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod3.c new file mode 100644 index 000000000..38bb35754 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod3.c @@ -0,0 +1,25 @@ +/* DSO for testing GCS. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +int fun3 (void) +{ + puts ("called function fun3"); + return 0; +} diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-noreturn.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-noreturn.c new file mode 100644 index 000000000..f55057924 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-noreturn.c @@ -0,0 +1,101 @@ +/* AArch64 test for GCS abort when returning to non-GCS address. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#include "tst-gcs-helper.h" + +#include +#include + +#include + +# ifndef PR_SET_SHADOW_STACK_STATUS +# define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# endif + +static void +run_with_gcs (void) +{ + int r = prctl (PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0); + /* Syscall should succeed. */ + TEST_VERIFY (r == 0); + bool gcs_enabled = __check_gcs_status (); + /* Now GCS should be enabled. */ + TEST_VERIFY (gcs_enabled); + printf ("GCS is %s\n", gcs_enabled ? "enabled" : "disabled"); +} + +static struct _aarch64_ctx * +extension (void *p) +{ + return p; +} + +#ifndef GCS_MAGIC +#define GCS_MAGIC 0x47435300 +#endif + +static void +handler (int sig, siginfo_t *si, void *ctx) +{ + TEST_VERIFY (sig == SIGSEGV); + ucontext_t *uc = ctx; + void *p = uc->uc_mcontext.__reserved; + if (extension (p)->magic == FPSIMD_MAGIC) + p = (char *)p + extension (p)->size; + if (extension (p)->magic == GCS_MAGIC) + { + struct { uint64_t x, gcspr, y, z; } *q = p; + printf ("GCS pointer: %016lx\n", q->gcspr); + exit (0); + } + else + exit (3); +} + +static int +do_test (void) +{ + /* Check if GCS could possibly be enabled. */ + if (!(getauxval (AT_HWCAP) & HWCAP_GCS)) + { + puts ("kernel or CPU does not support GCS"); + return EXIT_UNSUPPORTED; + } + bool gcs_enabled = __check_gcs_status (); + /* This test should be run with GCS initially disabled. */ + TEST_VERIFY (!gcs_enabled); + + /* We can't use EXPECTED_SIGNAL because of cases when + this test runs on a system that does not support GCS + which is being detected at runtime. */ + struct sigaction sigact; + sigemptyset (&sigact.sa_mask); + sigact.sa_flags = 0; + sigact.sa_flags = sigact.sa_flags | SA_SIGINFO; + sigact.sa_sigaction = handler; + xsigaction (SIGSEGV, &sigact, NULL); + + run_with_gcs (); + /* If we reached this point, then something went wrong. + Returning from a function that enabled GCS should result in + SIGSEGV that we catch with the handler set up above. 
*/ + return 2; +} + +#include diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-off.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-off.c new file mode 100644 index 000000000..bd688785b --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-off.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 0 +#include "tst-gcs-skeleton.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-on.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-on.c new file mode 100644 index 000000000..608318f26 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-on.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#include "tst-gcs-skeleton.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-off.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-off.c new file mode 100644 index 000000000..54e3b9a0d --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-off.c @@ -0,0 +1 @@ +#include "tst-gcs-optional-off.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-on.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-on.c new file mode 100644 index 000000000..11b884b42 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-on.c @@ -0,0 +1 @@ +#include "tst-gcs-optional-on.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override-static.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override-static.c new file mode 100644 index 000000000..09055dcdc --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override-static.c @@ -0,0 +1 @@ +#include "tst-gcs-override.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override.c new file mode 100644 index 000000000..608318f26 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#include "tst-gcs-skeleton.c" diff --git 
a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-disabled.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-disabled.c new file mode 100644 index 000000000..8598dc44b --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-disabled.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 0 +#include "tst-gcs-shared.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-enforced-abort.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-enforced-abort.c new file mode 100644 index 000000000..f1333cee9 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-enforced-abort.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#include "tst-gcs-shared.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-optional.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-optional.c new file mode 100644 index 000000000..8598dc44b --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-optional.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 0 +#include "tst-gcs-shared.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-override.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-override.c new file mode 100644 index 000000000..f1333cee9 --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-override.c @@ -0,0 +1,2 @@ +#define TEST_GCS_EXPECT_ENABLED 1 +#include "tst-gcs-shared.c" diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared.c new file mode 100644 index 000000000..1192de69f --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared.c @@ -0,0 +1,41 @@ +/* AArch64 tests for GCS. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "tst-gcs-helper.h" + +int fun1 (void); // tst-gcs-mod1.c +int fun3 (void); // tst-gcs-mod3.c + +static int +do_test (void) +{ + /* Check if GCS could possibly be enabled. */ + if (!(getauxval (AT_HWCAP) & HWCAP_GCS)) + { + puts ("kernel or CPU does not support GCS"); + return EXIT_UNSUPPORTED; + } +#if TEST_GCS_EXPECT_ENABLED + TEST_VERIFY (__check_gcs_status ()); +#else + TEST_VERIFY (!__check_gcs_status ()); +#endif + return fun1 () + fun3 (); +} + +#include diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-skeleton.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-skeleton.c new file mode 100644 index 000000000..feb5e33eb --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-skeleton.c @@ -0,0 +1,43 @@ +/* AArch64 tests for GCS. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "tst-gcs-helper.h" + +static int +do_test (void) +{ + /* Check if GCS could possibly be enabled. 
*/ + if (!(getauxval (AT_HWCAP) & HWCAP_GCS)) + { + puts ("kernel or CPU does not support GCS"); + return EXIT_UNSUPPORTED; + } + bool gcs_enabled = __check_gcs_status (); + if (gcs_enabled) + puts ("GCS enabled"); + else + puts ("GCS not enabled"); +#if TEST_GCS_EXPECT_ENABLED + TEST_VERIFY (gcs_enabled); +#else + TEST_VERIFY (!gcs_enabled); +#endif + return 0; +} + +#include diff --git a/sysdeps/unix/sysv/linux/bits/sched.h b/sysdeps/unix/sysv/linux/bits/sched.h index 3656e98ed..39b0b3d19 100644 --- a/sysdeps/unix/sysv/linux/bits/sched.h +++ b/sysdeps/unix/sysv/linux/bits/sched.h @@ -152,7 +152,7 @@ int sched_setattr (pid_t tid, struct sched_attr *attr, unsigned int flags) store it in *ATTR. */ int sched_getattr (pid_t tid, struct sched_attr *attr, unsigned int size, unsigned int flags) - __THROW __nonnull ((2)) __attr_access ((__write_only__, 2, 3)); + __THROW __nonnull ((2)); #endif diff --git a/sysdeps/unix/sysv/linux/dl-execstack.c b/sysdeps/unix/sysv/linux/dl-execstack.c index 9791b339c..6db960165 100644 --- a/sysdeps/unix/sysv/linux/dl-execstack.c +++ b/sysdeps/unix/sysv/linux/dl-execstack.c @@ -19,10 +19,10 @@ #include int -_dl_make_stack_executable (void **stack_endp) +_dl_make_stack_executable (const void *stack_endp) { /* This gives us the highest/lowest page that needs to be changed. */ - uintptr_t page = ((uintptr_t) *stack_endp + uintptr_t page = ((uintptr_t) stack_endp & -(intptr_t) GLRO(dl_pagesize)); if (__mprotect ((void *) page, GLRO(dl_pagesize), @@ -35,9 +35,6 @@ _dl_make_stack_executable (void **stack_endp) ) != 0) return errno; - /* Clear the address. */ - *stack_endp = NULL; - /* Remember that we changed the permission. 
*/ GL(dl_stack_flags) |= PF_X; diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h index f89e78424..d2ab4cb82 100644 --- a/sysdeps/unix/sysv/linux/rseq-internal.h +++ b/sysdeps/unix/sysv/linux/rseq-internal.h @@ -108,13 +108,12 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) if (size < RSEQ_AREA_SIZE_INITIAL) size = RSEQ_AREA_SIZE_INITIAL; - /* Initialize the rseq fields that are read by the kernel on - registration, there is no guarantee that struct pthread is - cleared on all architectures. */ + /* Initialize the whole rseq area to zero prior to registration. */ + memset (RSEQ_SELF (), 0, size); + + /* Set the cpu_id field to RSEQ_CPU_ID_UNINITIALIZED, this is checked by + the kernel at registration when CONFIG_DEBUG_RSEQ is enabled. */ RSEQ_SETMEM (cpu_id, RSEQ_CPU_ID_UNINITIALIZED); - RSEQ_SETMEM (cpu_id_start, 0); - RSEQ_SETMEM (rseq_cs, 0); - RSEQ_SETMEM (flags, 0); int ret = INTERNAL_SYSCALL_CALL (rseq, RSEQ_SELF (), size, 0, RSEQ_SIG); if (!INTERNAL_SYSCALL_ERROR_P (ret)) diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index 5311b594a..01b0192dd 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -21,6 +21,9 @@ tests += \ tst-cpu-features-supports-static \ tst-get-cpu-features \ tst-get-cpu-features-static \ + tst-gnu2-tls2-x86-noxsave \ + tst-gnu2-tls2-x86-noxsavec \ + tst-gnu2-tls2-x86-noxsavexsavec \ tst-hwcap-tunables \ # tests tests-static += \ @@ -91,6 +94,25 @@ CFLAGS-tst-gnu2-tls2.c += -msse CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell + +LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy +LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy +LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy + +# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled +# via tunable. 
+tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE +tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC +tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC +$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsave.out \ +$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ +$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ + $(objpfx)tst-gnu2-tls2mod0.so \ + $(objpfx)tst-gnu2-tls2mod1.so \ + $(objpfx)tst-gnu2-tls2mod2.so endif ifeq ($(subdir),math) diff --git a/sysdeps/x86/bits/floatn.h b/sysdeps/x86/bits/floatn.h index d197cb10d..4674165bd 100644 --- a/sysdeps/x86/bits/floatn.h +++ b/sysdeps/x86/bits/floatn.h @@ -25,11 +25,15 @@ floating-point type with the IEEE 754 binary128 format, and this glibc includes corresponding *f128 interfaces for it. The required libgcc support was added some time after the basic compiler - support, for x86_64 and x86. */ + support, for x86_64 and x86. Intel SYCL compiler doesn't support + _Float128: https://github.com/intel/llvm/issues/16903 + */ #if (defined __x86_64__ \ ? __GNUC_PREREQ (4, 3) \ : (defined __GNU__ ? __GNUC_PREREQ (4, 5) : __GNUC_PREREQ (4, 4))) \ - || __glibc_clang_prereq (3, 4) + || (__glibc_clang_prereq (3, 9) \ + && (!defined __INTEL_LLVM_COMPILER \ + || !defined SYCL_LANGUAGE_VERSION)) # define __HAVE_FLOAT128 1 #else # define __HAVE_FLOAT128 0 @@ -89,7 +93,7 @@ typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__))); /* The type _Float128 exists only since GCC 7.0. 
*/ # if !__GNUC_PREREQ (7, 0) \ || (defined __cplusplus && !__GNUC_PREREQ (13, 0)) \ - || __glibc_clang_prereq (3, 4) + || __glibc_clang_prereq (3, 9) typedef __float128 _Float128; # endif diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 27abaca8b..e50f1d693 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -24,6 +24,7 @@ #include #include #include +#include extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden; @@ -83,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *) # include #endif +unsigned long int _dl_x86_features_tlsdesc_state_size; + static void update_active (struct cpu_features *cpu_features) { @@ -317,17 +320,13 @@ update_active (struct cpu_features *cpu_features) = xsave_state_full_size; cpu_features->xsave_state_full_size = xsave_state_full_size; + _dl_x86_features_tlsdesc_state_size = xsave_state_full_size; /* Check if XSAVEC is available. */ if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) { - unsigned int xstate_comp_offsets[32]; - unsigned int xstate_comp_sizes[32]; -#ifdef __x86_64__ - unsigned int xstate_amx_comp_offsets[32]; - unsigned int xstate_amx_comp_sizes[32]; - unsigned int amx_ecx; -#endif + unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1]; + unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1]; unsigned int i; xstate_comp_offsets[0] = 0; @@ -335,39 +334,16 @@ update_active (struct cpu_features *cpu_features) xstate_comp_offsets[2] = 576; xstate_comp_sizes[0] = 160; xstate_comp_sizes[1] = 256; -#ifdef __x86_64__ - xstate_amx_comp_offsets[0] = 0; - xstate_amx_comp_offsets[1] = 160; - xstate_amx_comp_offsets[2] = 576; - xstate_amx_comp_sizes[0] = 160; - xstate_amx_comp_sizes[1] = 256; -#endif - for (i = 2; i < 32; i++) + for (i = 2; i <= X86_XSTATE_MAX_ID; i++) { if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0) { __cpuid_count (0xd, i, eax, ebx, ecx, edx); -#ifdef __x86_64__ - /* Include this in xsave_state_full_size. 
*/ - amx_ecx = ecx; - xstate_amx_comp_sizes[i] = eax; - if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0) - { - /* Exclude this from xsave_state_size. */ - ecx = 0; - xstate_comp_sizes[i] = 0; - } - else -#endif - xstate_comp_sizes[i] = eax; + xstate_comp_sizes[i] = eax; } else { -#ifdef __x86_64__ - amx_ecx = 0; - xstate_amx_comp_sizes[i] = 0; -#endif ecx = 0; xstate_comp_sizes[i] = 0; } @@ -376,44 +352,32 @@ update_active (struct cpu_features *cpu_features) { xstate_comp_offsets[i] = (xstate_comp_offsets[i - 1] - + xstate_comp_sizes[i -1]); + + xstate_comp_sizes[i - 1]); if ((ecx & (1 << 1)) != 0) xstate_comp_offsets[i] = ALIGN_UP (xstate_comp_offsets[i], 64); -#ifdef __x86_64__ - xstate_amx_comp_offsets[i] - = (xstate_amx_comp_offsets[i - 1] - + xstate_amx_comp_sizes[i - 1]); - if ((amx_ecx & (1 << 1)) != 0) - xstate_amx_comp_offsets[i] - = ALIGN_UP (xstate_amx_comp_offsets[i], - 64); -#endif } } /* Use XSAVEC. */ unsigned int size - = xstate_comp_offsets[31] + xstate_comp_sizes[31]; + = (xstate_comp_offsets[X86_XSTATE_MAX_ID] + + xstate_comp_sizes[X86_XSTATE_MAX_ID]); if (size) { + size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, + 64); #ifdef __x86_64__ - unsigned int amx_size - = (xstate_amx_comp_offsets[31] - + xstate_amx_comp_sizes[31]); - amx_size - = ALIGN_UP ((amx_size - + TLSDESC_CALL_REGISTER_SAVE_AREA), - 64); - /* Set xsave_state_full_size to the compact AMX - state size for XSAVEC. NB: xsave_state_full_size - is only used in _dl_tlsdesc_dynamic_xsave and - _dl_tlsdesc_dynamic_xsavec. */ - cpu_features->xsave_state_full_size = amx_size; + _dl_x86_features_tlsdesc_state_size = size; + /* Exclude the AMX space from the start of TILECFG + space to the end of TILEDATA space. If CPU + doesn't support AMX, TILECFG offset is the same + as TILEDATA + 1 offset. Otherwise, they are + multiples of 64. 
*/ + size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1] + - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]); #endif - cpu_features->xsave_state_size - = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, - 64); + cpu_features->xsave_state_size = size; CPU_FEATURE_SET (cpu_features, XSAVEC); } } @@ -538,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load "Incorrect index_arch_Fast_Unaligned_Load"); -/* Intel Family-6 microarch list. */ -enum +/* Intel microarch list. */ +enum intel_microarch { /* Atom processors. */ INTEL_ATOM_BONNELL, @@ -548,6 +512,7 @@ enum INTEL_ATOM_GOLDMONT, INTEL_ATOM_GOLDMONT_PLUS, INTEL_ATOM_SIERRAFOREST, + INTEL_ATOM_CLEARWATERFOREST, INTEL_ATOM_GRANDRIDGE, INTEL_ATOM_TREMONT, @@ -575,7 +540,9 @@ enum INTEL_BIGCORE_METEORLAKE, INTEL_BIGCORE_LUNARLAKE, INTEL_BIGCORE_ARROWLAKE, + INTEL_BIGCORE_PANTHERLAKE, INTEL_BIGCORE_GRANITERAPIDS, + INTEL_BIGCORE_DIAMONDRAPIDS, /* Mixed (bigcore + atom SOC). */ INTEL_MIXED_LAKEFIELD, @@ -589,7 +556,7 @@ enum INTEL_UNKNOWN, }; -static unsigned int +static enum intel_microarch intel_get_fam6_microarch (unsigned int model, __attribute__ ((unused)) unsigned int stepping) { @@ -620,6 +587,8 @@ intel_get_fam6_microarch (unsigned int model, return INTEL_ATOM_GOLDMONT_PLUS; case 0xAF: return INTEL_ATOM_SIERRAFOREST; + case 0xDD: + return INTEL_ATOM_CLEARWATERFOREST; case 0xB6: return INTEL_ATOM_GRANDRIDGE; case 0x86: @@ -727,8 +696,12 @@ intel_get_fam6_microarch (unsigned int model, return INTEL_BIGCORE_METEORLAKE; case 0xbd: return INTEL_BIGCORE_LUNARLAKE; + case 0xb5: + case 0xc5: case 0xc6: return INTEL_BIGCORE_ARROWLAKE; + case 0xCC: + return INTEL_BIGCORE_PANTHERLAKE; case 0xAD: case 0xAE: return INTEL_BIGCORE_GRANITERAPIDS; @@ -792,133 +765,20 @@ init_cpu_features (struct cpu_features *cpu_features) cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] &= ~bit_arch_Avoid_Non_Temporal_Memset; + enum intel_microarch microarch = INTEL_UNKNOWN; if (family == 0x06) { model += extended_model; - 
unsigned int microarch - = intel_get_fam6_microarch (model, stepping); + microarch = intel_get_fam6_microarch (model, stepping); + /* Disable TSX on some processors to avoid TSX on kernels that + weren't updated with the latest microcode package (which + disables broken feature by default). */ switch (microarch) { - /* Atom / KNL tuning. */ - case INTEL_ATOM_BONNELL: - /* BSF is slow on Bonnell. */ - cpu_features->preferred[index_arch_Slow_BSF] - |= bit_arch_Slow_BSF; - break; - - /* Unaligned load versions are faster than SSSE3 - on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ - case INTEL_ATOM_AIRMONT: - case INTEL_ATOM_SILVERMONT: - case INTEL_ATOM_GOLDMONT: - case INTEL_ATOM_GOLDMONT_PLUS: - - /* Knights Landing. Enable Silvermont optimizations. */ - case INTEL_KNIGHTS_LANDING: - - cpu_features->preferred[index_arch_Fast_Unaligned_Load] - |= (bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop - | bit_arch_Slow_SSE4_2); - break; - - case INTEL_ATOM_TREMONT: - /* Enable rep string instructions, unaligned load, unaligned - copy, pminub and avoid SSE 4.2 on Tremont. */ - cpu_features->preferred[index_arch_Fast_Rep_String] - |= (bit_arch_Fast_Rep_String - | bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop - | bit_arch_Slow_SSE4_2); - break; - - /* - Default tuned Knights microarch. - case INTEL_KNIGHTS_MILL: - */ - - /* - Default tuned atom microarch. - case INTEL_ATOM_SIERRAFOREST: - case INTEL_ATOM_GRANDRIDGE: - */ - - /* Bigcore/Default Tuning. */ default: - default_tuning: - /* Unknown family 0x06 processors. Assuming this is one - of Core i3/i5/i7 processors if AVX is available. */ - if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) - break; - - enable_modern_features: - /* Rep string instructions, unaligned load, unaligned copy, - and pminub are fast on Intel Core i3, i5 and i7. 
*/ - cpu_features->preferred[index_arch_Fast_Rep_String] - |= (bit_arch_Fast_Rep_String - | bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop); break; - case INTEL_BIGCORE_NEHALEM: - case INTEL_BIGCORE_WESTMERE: - /* Older CPUs prefer non-temporal stores at lower threshold. */ - cpu_features->cachesize_non_temporal_divisor = 8; - goto enable_modern_features; - - /* Older Bigcore microarch (smaller non-temporal store - threshold). */ - case INTEL_BIGCORE_SANDYBRIDGE: - case INTEL_BIGCORE_IVYBRIDGE: - case INTEL_BIGCORE_HASWELL: - case INTEL_BIGCORE_BROADWELL: - cpu_features->cachesize_non_temporal_divisor = 8; - goto default_tuning; - - /* Newer Bigcore microarch (larger non-temporal store - threshold). */ - case INTEL_BIGCORE_SKYLAKE_AVX512: - case INTEL_BIGCORE_CANNONLAKE: - /* Benchmarks indicate non-temporal memset is not - necessarily profitable on SKX (and in some cases much - worse). This is likely unique to SKX due its it unique - mesh interconnect (not present on ICX or BWD). Disable - non-temporal on all Skylake servers. */ - cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] - |= bit_arch_Avoid_Non_Temporal_Memset; - /* fallthrough */ - case INTEL_BIGCORE_COMETLAKE: - case INTEL_BIGCORE_SKYLAKE: - case INTEL_BIGCORE_KABYLAKE: - case INTEL_BIGCORE_ICELAKE: - case INTEL_BIGCORE_TIGERLAKE: - case INTEL_BIGCORE_ROCKETLAKE: - case INTEL_BIGCORE_RAPTORLAKE: - case INTEL_BIGCORE_METEORLAKE: - case INTEL_BIGCORE_LUNARLAKE: - case INTEL_BIGCORE_ARROWLAKE: - case INTEL_BIGCORE_SAPPHIRERAPIDS: - case INTEL_BIGCORE_EMERALDRAPIDS: - case INTEL_BIGCORE_GRANITERAPIDS: - cpu_features->cachesize_non_temporal_divisor = 2; - goto default_tuning; - - /* Default tuned Mixed (bigcore + atom SOC). 
*/ - case INTEL_MIXED_LAKEFIELD: - case INTEL_MIXED_ALDERLAKE: - cpu_features->cachesize_non_temporal_divisor = 2; - goto default_tuning; - } - - /* Disable TSX on some processors to avoid TSX on kernels that - weren't updated with the latest microcode package (which - disables broken feature by default). */ - switch (microarch) - { case INTEL_BIGCORE_SKYLAKE_AVX512: /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */ if (stepping <= 5) @@ -927,38 +787,163 @@ init_cpu_features (struct cpu_features *cpu_features) case INTEL_BIGCORE_KABYLAKE: /* NB: Although the errata documents that for model == 0x8e - (kabylake skylake client), only 0xb stepping or lower are - impacted, the intention of the errata was to disable TSX on - all client processors on all steppings. Include 0xc - stepping which is an Intel Core i7-8665U, a client mobile - processor. */ + (kabylake skylake client), only 0xb stepping or lower are + impacted, the intention of the errata was to disable TSX on + all client processors on all steppings. Include 0xc + stepping which is an Intel Core i7-8665U, a client mobile + processor. */ if (stepping > 0xc) break; /* Fall through. */ case INTEL_BIGCORE_SKYLAKE: - /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for - processors listed in: - -https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html - */ - disable_tsx: - CPU_FEATURE_UNSET (cpu_features, HLE); - CPU_FEATURE_UNSET (cpu_features, RTM); - CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); - break; + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for + processors listed in: + + https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + */ +disable_tsx: + CPU_FEATURE_UNSET (cpu_features, HLE); + CPU_FEATURE_UNSET (cpu_features, RTM); + CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); + break; case INTEL_BIGCORE_HASWELL: - /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working - TSX. 
Haswell also include other model numbers that have - working TSX. */ - if (model == 0x3f && stepping >= 4) + /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working + TSX. Haswell also includes other model numbers that have + working TSX. */ + if (model == 0x3f && stepping >= 4) break; - CPU_FEATURE_UNSET (cpu_features, RTM); - break; + CPU_FEATURE_UNSET (cpu_features, RTM); + break; } } + else if (family == 19) + switch (model) + { + case 0x01: + microarch = INTEL_BIGCORE_DIAMONDRAPIDS; + break; + default: + break; + } + + switch (microarch) + { + /* Atom / KNL tuning. */ + case INTEL_ATOM_BONNELL: + /* BSF is slow on Bonnell. */ + cpu_features->preferred[index_arch_Slow_BSF] + |= bit_arch_Slow_BSF; + break; + + /* Unaligned load versions are faster than SSSE3 + on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ + case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_SILVERMONT: + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: + + /* Knights Landing. Enable Silvermont optimizations. */ + case INTEL_KNIGHTS_LANDING: + + cpu_features->preferred[index_arch_Fast_Unaligned_Load] + |= (bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop + | bit_arch_Slow_SSE4_2); + break; + + case INTEL_ATOM_TREMONT: + /* Enable rep string instructions, unaligned load, unaligned + copy, pminub and avoid SSE 4.2 on Tremont. */ + cpu_features->preferred[index_arch_Fast_Rep_String] + |= (bit_arch_Fast_Rep_String + | bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop + | bit_arch_Slow_SSE4_2); + break; + + /* + Default tuned Knights microarch. + case INTEL_KNIGHTS_MILL: + */ + + /* + Default tuned atom microarch. + case INTEL_ATOM_SIERRAFOREST: + case INTEL_ATOM_GRANDRIDGE: + case INTEL_ATOM_CLEARWATERFOREST: + */ + + /* Bigcore/Default Tuning. */ + default: + default_tuning: + /* Unknown Intel processors. Assuming this is one of Core + i3/i5/i7 processors if AVX is available. 
*/ + if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) + break; + + enable_modern_features: + /* Rep string instructions, unaligned load, unaligned copy, + and pminub are fast on Intel Core i3, i5 and i7. */ + cpu_features->preferred[index_arch_Fast_Rep_String] + |= (bit_arch_Fast_Rep_String + | bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop); + break; + + case INTEL_BIGCORE_NEHALEM: + case INTEL_BIGCORE_WESTMERE: + /* Older CPUs prefer non-temporal stores at lower threshold. */ + cpu_features->cachesize_non_temporal_divisor = 8; + goto enable_modern_features; + + /* Older Bigcore microarch (smaller non-temporal store + threshold). */ + case INTEL_BIGCORE_SANDYBRIDGE: + case INTEL_BIGCORE_IVYBRIDGE: + case INTEL_BIGCORE_HASWELL: + case INTEL_BIGCORE_BROADWELL: + cpu_features->cachesize_non_temporal_divisor = 8; + goto default_tuning; + + /* Newer Bigcore microarch (larger non-temporal store + threshold). */ + case INTEL_BIGCORE_SKYLAKE_AVX512: + case INTEL_BIGCORE_CANNONLAKE: + /* Benchmarks indicate non-temporal memset is not + necessarily profitable on SKX (and in some cases much + worse). This is likely unique to SKX due to its unique + mesh interconnect (not present on ICX or BWD). Disable + non-temporal on all Skylake servers. */ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + |= bit_arch_Avoid_Non_Temporal_Memset; + /* fallthrough */ + case INTEL_BIGCORE_COMETLAKE: + case INTEL_BIGCORE_SKYLAKE: + case INTEL_BIGCORE_KABYLAKE: + case INTEL_BIGCORE_ICELAKE: + case INTEL_BIGCORE_TIGERLAKE: + case INTEL_BIGCORE_ROCKETLAKE: + case INTEL_BIGCORE_RAPTORLAKE: + case INTEL_BIGCORE_METEORLAKE: + case INTEL_BIGCORE_LUNARLAKE: + case INTEL_BIGCORE_ARROWLAKE: + case INTEL_BIGCORE_PANTHERLAKE: + case INTEL_BIGCORE_SAPPHIRERAPIDS: + case INTEL_BIGCORE_EMERALDRAPIDS: + case INTEL_BIGCORE_GRANITERAPIDS: + case INTEL_BIGCORE_DIAMONDRAPIDS: + /* Default tuned Mixed (bigcore + atom SOC). 
*/ + case INTEL_MIXED_LAKEFIELD: + case INTEL_MIXED_ALDERLAKE: + cpu_features->cachesize_non_temporal_divisor = 2; + goto default_tuning; + } /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER if AVX512ER is available. Don't use AVX512 to avoid lower CPU @@ -1159,6 +1144,9 @@ no_cpuid: TUNABLE_CALLBACK (set_prefer_map_32bit_exec)); #endif + /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build + requires AVX and therefore XSAVE or XSAVEC support. */ +#ifndef GCCMACRO__AVX__ bool disable_xsave_features = false; if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE)) @@ -1212,6 +1200,7 @@ no_cpuid: CPU_FEATURE_UNSET (cpu_features, FMA4); } +#endif #ifdef __x86_64__ GLRO(dl_hwcap) = HWCAP_X86_64; diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index 342317680..d692e0e0d 100644 --- a/sysdeps/x86/cpu-tunables.c +++ b/sysdeps/x86/cpu-tunables.c @@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) /* Update xsave_state_size to XSAVE state size. 
*/ cpu_features->xsave_state_size = cpu_features->xsave_state_full_size; + _dl_x86_features_tlsdesc_state_size + = cpu_features->xsave_state_full_size; CPU_FEATURE_UNSET (cpu_features, XSAVEC); } } diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c index 7d0373602..870b1268d 100644 --- a/sysdeps/x86/dl-diagnostics-cpu.c +++ b/sysdeps/x86/dl-diagnostics-cpu.c @@ -89,6 +89,8 @@ _dl_diagnostics_cpu (void) cpu_features->xsave_state_size); print_cpu_features_value ("xsave_state_full_size", cpu_features->xsave_state_full_size); + print_cpu_features_value ("tlsdesc_state_full_size", + _dl_x86_features_tlsdesc_state_size); print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size); print_cpu_features_value ("shared_cache_size", cpu_features->shared_cache_size); diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h index 9c485d38e..fbf1b8911 100644 --- a/sysdeps/x86/include/cpu-features.h +++ b/sysdeps/x86/include/cpu-features.h @@ -935,8 +935,6 @@ struct cpu_features /* The full state size for XSAVE when XSAVEC is disabled by GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC - - and the AMX state size when XSAVEC is available. */ unsigned int xsave_state_full_size; /* Data cache size for use in memory and string routines, typically @@ -990,6 +988,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void) #define __get_cpu_features() _dl_x86_get_cpu_features() +#if IS_IN (rtld) || IS_IN (libc) +/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to + xsave_state_size from struct cpu_features, this includes additional + registers. */ +extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden; +#endif + #if defined (_LIBC) && !IS_IN (nonlib) /* Unused for x86. 
*/ # define INIT_ARCH() diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h index 541393f1d..c3c73e75d 100644 --- a/sysdeps/x86/sysdep.h +++ b/sysdeps/x86/sysdep.h @@ -102,6 +102,9 @@ | (1 << X86_XSTATE_ZMM_ID) \ | (1 << X86_XSTATE_APX_F_ID)) +/* The maximum supported xstate ID. */ +# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID + /* AMX state mask. */ # define AMX_STATE_SAVE_MASK \ ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID)) @@ -123,6 +126,9 @@ | (1 << X86_XSTATE_K_ID) \ | (1 << X86_XSTATE_ZMM_H_ID)) +/* The maximum supported xstate ID. */ +# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID + /* States to be included in xsave_state_size. */ # define FULL_STATE_SAVE_MASK STATE_SAVE_MASK #endif diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c new file mode 100644 index 000000000..f0024c143 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c new file mode 100644 index 000000000..f0024c143 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c new file mode 100644 index 000000000..f0024c143 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 9d31685e0..5723ec184 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -142,7 +142,6 @@ CFLAGS-tst-avxmod.c += $(AVX-CFLAGS) AVX512-CFLAGS = -mavx512f CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS) CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS) -CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS) CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS) CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS) diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h index 
9965ddd2c..4f496de8c 100644 --- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h @@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic: # endif #else /* Allocate stack space of the required size to save the state. */ - sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP + sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP #endif /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, r10 and r11. */ diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index e823d2fcc..340342244 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -1,15 +1,18 @@ ifeq ($(subdir),math) CFLAGS-e_asin-fma.c = -mfma -mavx2 CFLAGS-e_atan2-fma.c = -mfma -mavx2 +CFLAGS-e_atanh-fma.c = -mfma -mavx2 CFLAGS-e_exp-fma.c = -mfma -mavx2 CFLAGS-e_log-fma.c = -mfma -mavx2 CFLAGS-e_log2-fma.c = -mfma -mavx2 CFLAGS-e_pow-fma.c = -mfma -mavx2 +CFLAGS-e_sinh-fma.c = -mfma -mavx2 CFLAGS-s_atan-fma.c = -mfma -mavx2 CFLAGS-s_expm1-fma.c = -mfma -mavx2 CFLAGS-s_log1p-fma.c = -mfma -mavx2 CFLAGS-s_sin-fma.c = -mfma -mavx2 CFLAGS-s_tan-fma.c = -mfma -mavx2 +CFLAGS-s_tanh-fma.c = -mfma -mavx2 CFLAGS-s_sincos-fma.c = -mfma -mavx2 CFLAGS-s_exp10m1f-fma.c = -mfma -mavx2 CFLAGS-s_exp2m1f-fma.c = -mfma -mavx2 @@ -57,6 +60,7 @@ libm-sysdep_routines += \ e_asin-fma \ e_atan2-avx \ e_atan2-fma \ + e_atanh-fma \ e_exp-avx \ e_exp-fma \ e_exp2f-fma \ @@ -68,6 +72,7 @@ libm-sysdep_routines += \ e_logf-fma \ e_pow-fma \ e_powf-fma \ + e_sinh-fma \ s_atan-avx \ s_atan-fma \ s_ceil-sse4_1 \ @@ -96,6 +101,7 @@ libm-sysdep_routines += \ s_sinf-sse2 \ s_tan-avx \ s_tan-fma \ + s_tanh-fma \ s_trunc-sse4_1 \ s_truncf-sse4_1 \ # libm-sysdep_routines diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c new file mode 100644 index 000000000..c3f2f9e55 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c @@ -0,0 +1,6 @@ 
+#define __ieee754_atanh __ieee754_atanh_fma +#define __log1p __log1p_fma + +#define SECTION __attribute__ ((section (".text.fma"))) + +#include diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c new file mode 100644 index 000000000..d2b785dfc --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c @@ -0,0 +1,34 @@ +/* Multiple versions of atanh. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL +# include + +extern double __redirect_ieee754_atanh (double); + +# define SYMBOL_NAME ieee754_atanh +# include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ()); + +libm_alias_finite (__ieee754_atanh, __atanh) + +# define __ieee754_atanh __ieee754_atanh_sse2 +#endif +#include diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c new file mode 100644 index 000000000..e0e1e39a7 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c @@ -0,0 +1,12 @@ +#define __ieee754_sinh __ieee754_sinh_fma +#define __ieee754_exp __ieee754_exp_fma +#define __expm1 __expm1_fma + +/* NB: __expm1 may be expanded to __expm1_fma in the following + prototypes. 
*/ +extern long double __expm1l (long double); +extern long double __expm1f128 (long double); + +#define SECTION __attribute__ ((section (".text.fma"))) + +#include diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c new file mode 100644 index 000000000..3d3c18ccd --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c @@ -0,0 +1,35 @@ +/* Multiple versions of sinh. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL +# include + +extern double __redirect_ieee754_sinh (double); + +# define SYMBOL_NAME ieee754_sinh +# include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh, + IFUNC_SELECTOR ()); + +libm_alias_finite (__ieee754_sinh, __sinh) + +# define __ieee754_sinh __ieee754_sinh_sse2 +#endif +#include diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c new file mode 100644 index 000000000..1b808b122 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c @@ -0,0 +1,11 @@ +#define __tanh __tanh_fma +#define __expm1 __expm1_fma + +/* NB: __expm1 may be expanded to __expm1_fma in the following + prototypes. 
*/ +extern long double __expm1l (long double); +extern long double __expm1f128 (long double); + +#define SECTION __attribute__ ((section (".text.fma"))) + +#include diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c new file mode 100644 index 000000000..5539b6c61 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c @@ -0,0 +1,31 @@ +/* Multiple versions of tanh. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#include +#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL + +extern double __redirect_tanh (double); + +# define SYMBOL_NAME tanh +# include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ()); + +# define __tanh __tanh_sse2 +#endif +#include diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index a8349775d..c2dcadd1a 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -922,7 +922,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2)), __wcsncpy_avx2) - X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, + X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, 1, __wcsncpy_generic)) @@ -952,7 +952,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2)), __wcpncpy_avx2) - X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, + X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, 1, __wcpncpy_generic)) diff --git a/sysdeps/x86_64/tst-auditmod10b.c b/sysdeps/x86_64/tst-auditmod10b.c index 6eb21b6f0..0b994ef0f 100644 --- a/sysdeps/x86_64/tst-auditmod10b.c +++ b/sysdeps/x86_64/tst-auditmod10b.c @@ -125,7 +125,6 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, #include -#ifdef __AVX512F__ #include #include @@ -148,9 +147,37 @@ check_avx512 (void) return (eax & 0xe6) == 0xe6; } -#else -#include -#endif +void +__attribute__ ((target ("avx512f"))) +pltenter_avx512f (La_regs *regs, long int *framesizep) +{ + __m512i zero = _mm512_setzero_si512 (); + if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[1], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[2], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[3], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[4], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[5], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[6], &zero,
sizeof (zero)) + || memcmp (&regs->lr_vector[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_vector[i].zmm[0] + = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1); + + __m512i zmm = _mm512_set1_epi64 (-1); + asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); + asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); + asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" ); + asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" ); + asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" ); + asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" ); + asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" ); + asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" ); + + *framesizep = 1024; +} ElfW(Addr) pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, @@ -160,39 +187,33 @@ pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", symname, (long int) sym->st_value, ndx, *flags); -#ifdef __AVX512F__ if (check_avx512 () && strcmp (symname, "audit_test") == 0) + pltenter_avx512f (regs, framesizep); + + return sym->st_value; +} + +void +__attribute__ ((target ("avx512f"))) +pltexit_avx512f (const La_regs *inregs, La_retval *outregs) +{ + __m512i zero = _mm512_setzero_si512 (); + if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) { - __m512i zero = _mm512_setzero_si512 (); - if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero)) - || memcmp (&regs->lr_vector[1], &zero, sizeof (zero)) - || memcmp (&regs->lr_vector[2], &zero, sizeof (zero)) - || memcmp (&regs->lr_vector[3], &zero, sizeof (zero)) - || memcmp (&regs->lr_vector[4], &zero, sizeof (zero)) - || memcmp (&regs->lr_vector[5], &zero, sizeof (zero)) - || memcmp (&regs->lr_vector[6], &zero, sizeof (zero)) - || memcmp (&regs->lr_vector[7], &zero, sizeof (zero))) - abort (); - - for (int i = 0; i < 8; i++) - regs->lr_vector[i].zmm[0]
- = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1); - - __m512i zmm = _mm512_set1_epi64 (-1); - asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); - asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); - asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" ); - asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" ); - asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" ); - asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" ); - asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" ); - asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" ); - - *framesizep = 1024; + __m512i zmm = _mm512_set1_epi64 (i + 1); + if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0) + abort (); } -#endif - return sym->st_value; + outregs->lrv_vector0.zmm[0] + = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876); + + __m512i zmm = _mm512_set1_epi64 (-1); + asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); + asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); } unsigned int @@ -204,28 +225,8 @@ pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, symname, (long int) sym->st_value, ndx, (ptrdiff_t) outregs->int_retval); -#ifdef __AVX512F__ if (check_avx512 () && strcmp (symname, "audit_test") == 0) - { - __m512i zero = _mm512_setzero_si512 (); - if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) - abort (); - - for (int i = 0; i < 8; i++) - { - __m512i zmm = _mm512_set1_epi64 (i + 1); - if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0) - abort (); - } - - outregs->lrv_vector0.zmm[0] - = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876); - - __m512i zmm = _mm512_set1_epi64 (-1); - asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); - asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); - } -#endif + pltexit_avx512f (inregs, outregs); return 0; } -- 2.30.2